# **Install dependencies**

In [None]:
!pip install transformers
!pip install datasets
!pip install accelerate
!pip install torch
!pip install evaluate



In [None]:
# !pip install datasets evaluate transformers accelerate torch

In [None]:
import collections

import evaluate
import numpy as np
import torch
from datasets import load_dataset
from tqdm.auto import tqdm
from transformers import (
    AdamW,
    AlbertForQuestionAnswering,
    AutoTokenizer,
    DefaultDataCollator,
    Trainer,
    TrainingArguments,
    create_optimizer,
    pipeline,
)

# **Load Dataset**

In [None]:
dataset = load_dataset("rajpurkar/squad_v2")

In [None]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 130319
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 11873
    })
})

In [None]:
len_train = len(dataset["train"])
len_train

130319

In [None]:
print("id: ", dataset["train"][0]["id"])
print("title: ", dataset["train"][0]["title"])
print("Context: ", dataset["train"][0]["context"])
print("Question: ", dataset["train"][0]["question"])
print("Answer: ", dataset["train"][0]["answers"])

id:  56be85543aeaaa14008c9063
title:  Beyoncé
Context:  Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".
Question:  When did Beyonce start becoming popular?
Answer:  {'text': ['in the late 1990s'], 'answer_start': [269]}


In [None]:
# Fixed seed so that Restart & Run All always selects the same examples;
# without it every run trains and evaluates on a different random subset.
split_seed = 42

# Keep 10% of the original training split.
dataset_10_percent = dataset["train"].train_test_split(test_size=0.9, seed=split_seed)["train"]

# Further split the 10% dataset into train (80%) and eval (20%) sets
dataset_split = dataset_10_percent.train_test_split(test_size=0.2, seed=split_seed)

# **Load Model**

In [None]:
# Load the tokenizer and the model from one checkpoint id so the two can never
# drift apart when the checkpoint is changed.
model_checkpoint = "albert/albert-base-v2"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AlbertForQuestionAnswering.from_pretrained(model_checkpoint)

Some weights of AlbertForQuestionAnswering were not initialized from the model checkpoint at albert/albert-base-v2 and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
print(tokenizer.is_fast)

True


# **Try the tokenizer**

In [None]:
sample_question = dataset['train'][0]['question']
sample_context = dataset['train'][0]['context']
inputs = tokenizer(sample_question, sample_context)
inputs


{'input_ids': [2, 76, 144, 24809, 799, 1535, 844, 60, 3, 24809, 16004, 3745, 143, 1355, 8, 1367, 815, 13, 5, 118, 2161, 1, 728, 1, 23157, 1, 118, 12092, 8, 7370, 8, 6366, 6, 13, 5, 381, 299, 268, 15, 2229, 6, 25, 40, 189, 1377, 15, 7815, 15, 571, 1421, 17, 2182, 9, 386, 17, 1127, 19, 4187, 15, 1338, 15, 39, 986, 19, 617, 3385, 17, 4626, 5868, 28, 21, 850, 15, 17, 1092, 20, 2720, 19, 14, 456, 961, 18, 28, 672, 1377, 16, 761, 1569, 220, 695, 8, 8024, 11271, 22, 18, 850, 9, 1471, 34, 36, 321, 15, 17677, 143, 1355, 15, 14, 214, 178, 53, 16, 14, 126, 22, 18, 246, 8, 10033, 695, 1170, 16, 65, 85, 9, 66, 16436, 441, 14, 830, 16, 24809, 22, 18, 893, 244, 15, 23853, 19, 339, 13, 5, 3325, 6, 15, 56, 613, 36, 28, 21, 2046, 1169, 3497, 15, 1931, 355, 8877, 1160, 17, 1070, 14, 3304, 1047, 808, 234, 8, 849, 2391, 13, 7, 23282, 19, 339, 7, 17, 13, 7, 12152, 883, 7, 9, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [None]:
print(tokenizer.decode(inputs["input_ids"]))

[CLS] when did beyonce start becoming popular?[SEP] beyonce giselle knowles-carter (/bi<unk>j<unk>nse<unk>/ bee-yon-say) (born september 4, 1981) is an american singer, songwriter, record producer and actress. born and raised in houston, texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of r&b girl-group destiny's child. managed by her father, mathew knowles, the group became one of the world's best-selling girl groups of all time. their hiatus saw the release of beyonce's debut album, dangerously in love (2003), which established her as a solo artist worldwide, earned five grammy awards and featured the billboard hot 100 number-one singles "crazy in love" and "baby boy".[SEP]


# **Preprocess**

In [None]:
# Tokenization window settings shared by the training and validation preprocessing functions.
# NOTE(review): max_query_length is not referenced by any cell visible in this notebook.
max_query_length = 64
# Passed to the tokenizer as `max_length` (with padding="max_length").
# NOTE(review): 384 is the value commonly used for this setup — confirm 386 is intended.
max_seq_length = 386
# Passed to the tokenizer as `stride` together with return_overflowing_tokens=True.
doc_stride = 128


def preprocess_training_examples(examples):
    """Tokenize a batch of SQuAD-style examples and attach answer token positions.

    Each (question, context) pair is tokenized with a sliding window over the
    context, so one example may yield several features.  For every feature the
    start and end token indices of the first gold answer are computed.  When
    the question has no answer, or the answer is not fully contained in the
    feature's context window, both labels point at the [CLS] token.

    Args:
        examples: batch dict with "question", "context" and "answers" columns;
            every entry of "answers" holds the lists "text" and "answer_start".

    Returns:
        The tokenizer output (without "offset_mapping" and
        "overflow_to_sample_mapping") extended with the lists
        "start_positions" and "end_positions", one entry per feature.
    """
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_seq_length,
        truncation="only_second",
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        # The "no answer" label is the *position* of [CLS] inside this feature,
        # not the vocabulary id of the [CLS] token.
        cls_index = inputs["input_ids"][i].index(tokenizer.cls_token_id)

        # Several features can originate from the same example.
        answer = answers[sample_map[i]]

        if len(answer["answer_start"]) == 0:
            # Unanswerable question.
            start_positions.append(cls_index)
            end_positions.append(cls_index)
            continue

        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Locate the first and last token that belong to the context (sequence id 1).
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            # The answer is not fully inside this window.
            start_positions.append(cls_index)
            end_positions.append(cls_index)
            continue

        # Walk forward to the last token whose start offset is <= start_char.
        idx = context_start
        while idx <= context_end and offset[idx][0] <= start_char:
            idx += 1
        start_positions.append(idx - 1)

        # Walk backward to the first token whose end offset is >= end_char.
        idx = context_end
        while idx >= context_start and offset[idx][1] >= end_char:
            idx -= 1
        end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [None]:
def preprocess_validation_examples(examples):
    """Tokenize a batch for evaluation, keeping what is needed to recover answer text.

    The batch is tokenized with the same sliding-window settings as the
    training data.  Every resulting feature records the id of the example it
    was cut from ("example_id"), and its "offset_mapping" is rewritten so that
    only context tokens keep their character offsets; all other positions
    become None.

    Args:
        examples: batch dict with "id", "question" and "context" columns.

    Returns:
        The tokenizer output (without "overflow_to_sample_mapping") with the
        masked "offset_mapping" and the added "example_id" list.
    """
    encoded = tokenizer(
        [question.strip() for question in examples["question"]],
        examples["context"],
        max_length=max_seq_length,
        truncation="only_second",
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    feature_to_example = encoded.pop("overflow_to_sample_mapping")
    source_ids = examples["id"]
    feature_count = len(encoded["input_ids"])

    # One example id per feature, in feature order.
    owner_ids = [source_ids[feature_to_example[pos]] for pos in range(feature_count)]

    # Blank out the offsets of every token that is not part of the context.
    for pos in range(feature_count):
        token_sequence_ids = encoded.sequence_ids(pos)
        masked_offsets = []
        for token_pos, span in enumerate(encoded["offset_mapping"][pos]):
            if token_sequence_ids[token_pos] == 1:
                masked_offsets.append(span)
            else:
                masked_offsets.append(None)
        encoded["offset_mapping"][pos] = masked_offsets

    encoded["example_id"] = owner_ids
    return encoded

In [None]:
train_dataset = dataset_split["train"].map(
    preprocess_training_examples,
    batched=True,
    remove_columns=dataset_split["train"].column_names,
)
len(dataset_split["train"]), len(train_dataset)

Map:   0%|          | 0/10424 [00:00<?, ? examples/s]

(10424, 10548)

In [None]:
validation_dataset = dataset_split["test"].map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=dataset_split["test"].column_names,
)


Map:   0%|          | 0/2607 [00:00<?, ? examples/s]

In [None]:
len(dataset_split["test"]), len(validation_dataset)

(2607, 2631)

In [None]:
# Post-processing settings read by compute_metrics.
# Number of top-scoring start indexes and end indexes examined per feature.
n_best = 20
# Longest answer span, counted in tokens, that compute_metrics keeps as a candidate.
max_answer_length = 30
# NOTE(review): compute_metrics builds its own local list with this name, so this
# module-level list appears to be unused — confirm before relying on it.
predicted_answers = []

In [None]:
# Scorer whose compute() is called at the end of compute_metrics.
# NOTE(review): this loads the "squad" metric while the data loaded above is
# "rajpurkar/squad_v2". The recorded evaluation run further down ends in
# `ValueError: max() arg is an empty sequence`, presumably raised while scoring
# examples whose reference answer list is empty — confirm, and consider "squad_v2".
metric = evaluate.load("squad")

In [None]:
def compute_metrics(start_logits, end_logits, features, examples, null_score_diff_threshold=0.0):
    """Convert start/end logits into text answers and score them against the references.

    For every example, the candidate spans of all its features are ranked by
    ``start_logit + end_logit``; the best one becomes the prediction.

    When the references contain unanswerable questions (empty answer lists),
    the SQuAD v1 scorer cannot be used: it takes ``max()`` over the per-answer
    scores and fails on an empty list.  In that case the "squad_v2" scorer is
    used instead, every prediction carries a ``no_answer_probability`` field,
    and an empty answer is predicted when the [CLS] ("no answer") score beats
    the best span score by more than ``null_score_diff_threshold``.

    Args:
        start_logits: per-feature start scores, indexed ``[feature][token]``.
        end_logits: per-feature end scores, indexed ``[feature][token]``.
        features: tokenized validation features providing "example_id" and
            "offset_mapping" (None for tokens outside the context).
        examples: the original examples providing "id", "context", "answers".
        null_score_diff_threshold: margin by which the "no answer" score must
            exceed the best span score before an empty answer is predicted.
            Only used when predictions are produced in the v2 format.

    Returns:
        The result of the scorer's ``compute`` call.
    """
    # Group the feature indices by the example they were cut from.
    example_to_features = collections.defaultdict(list)
    for idx, feature in enumerate(features):
        example_to_features[feature["example_id"]].append(idx)

    theoretical_answers = [{"id": ex["id"], "answers": ex["answers"]} for ex in examples]

    # Pick a scorer that can handle the references we actually have.
    has_unanswerable = any(len(ref["answers"]["text"]) == 0 for ref in theoretical_answers)
    metric_is_v2 = getattr(metric, "name", "") == "squad_v2"
    if has_unanswerable and not metric_is_v2:
        scorer = evaluate.load("squad_v2")
    else:
        scorer = metric
    v2_format = has_unanswerable or metric_is_v2

    predicted_answers = []
    for example in tqdm(examples):
        example_id = example["id"]
        context = example["context"]
        best_answer = None
        null_score = None

        # Loop through all features associated with that example
        for feature_index in example_to_features[example_id]:
            start_logit = start_logits[feature_index]
            end_logit = end_logits[feature_index]
            offsets = features[feature_index]["offset_mapping"]

            # Score of pointing both ends at [CLS]; keep the smallest across features.
            feature_null_score = start_logit[0] + end_logit[0]
            if null_score is None or feature_null_score < null_score:
                null_score = feature_null_score

            start_indexes = np.argsort(start_logit)[-1 : -n_best - 1 : -1].tolist()
            end_indexes = np.argsort(end_logit)[-1 : -n_best - 1 : -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Skip answers that are not fully in the context
                    if offsets[start_index] is None or offsets[end_index] is None:
                        continue
                    # Skip answers with a length that is either < 0 or > max_answer_length
                    if (
                        end_index < start_index
                        or end_index - start_index + 1 > max_answer_length
                    ):
                        continue

                    score = start_logit[start_index] + end_logit[end_index]
                    # Strict comparison keeps the first candidate on ties.
                    if best_answer is None or score > best_answer["logit_score"]:
                        best_answer = {
                            "text": context[offsets[start_index][0] : offsets[end_index][1]],
                            "logit_score": score,
                        }

        # Select the answer with the best score, or the empty answer.
        if best_answer is None:
            prediction_text = ""
        elif (
            v2_format
            and null_score is not None
            and null_score - best_answer["logit_score"] > null_score_diff_threshold
        ):
            prediction_text = ""
        else:
            prediction_text = best_answer["text"]

        prediction = {"id": example_id, "prediction_text": prediction_text}
        if v2_format:
            # Required by the squad_v2 scorer; 0.0 leaves the decision to prediction_text.
            prediction["no_answer_probability"] = 0.0
        predicted_answers.append(prediction)

    return scorer.compute(predictions=predicted_answers, references=theoretical_answers)

In [None]:
# Hyperparameters and output locations handed to the Trainer below.
training_args = TrainingArguments(
    output_dir='./results',
    # NOTE(review): newer transformers releases name this argument `eval_strategy`;
    # confirm against the installed version, since the install cell pins none.
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)



In [None]:
# `Trainer.__init__` has no `return_loss` keyword; passing it raises TypeError,
# so it is not passed here.
# The validation features carry no start/end labels, which is why the training
# log shows "No log" in the validation-loss column.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
)

In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,1.2809,No log
2,1.0369,No log
3,0.3639,No log


TrainOutput(global_step=1980, training_loss=1.0759086218747225, metrics={'train_runtime': 1484.01, 'train_samples_per_second': 21.323, 'train_steps_per_second': 1.334, 'total_flos': 526844664929952.0, 'train_loss': 1.0759086218747225, 'epoch': 3.0})

In [None]:
# Run the fine-tuned model over the validation features; the first element of the
# returned triple is unpacked below into start and end logits.
predictions, _, _ = trainer.predict(validation_dataset)
start_logits, end_logits = predictions
# Score against the untokenized evaluation examples the features were built from.
metrics = compute_metrics(start_logits, end_logits, validation_dataset, dataset_split["test"])

  0%|          | 0/2607 [00:00<?, ?it/s]

ValueError: max() arg is an empty sequence

In [None]:
dataset_split["test"]


Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 2607
})

In [None]:
def inference(question, context, verbose=True):
    """Answer `question` from `context` with the fine-tuned model.

    The pair is tokenized as one sequence (truncated to 512 tokens), run
    through the model without gradient tracking, and the answer is the token
    span from the highest-scoring start position to the highest-scoring end
    position at or after that start.

    Args:
        question: the question text.
        context: the passage that should contain the answer.
        verbose: when True (default) also print the tokenized inputs, the
            start index and the start token, as the original version did.

    Returns:
        The decoded answer string (it is also printed).
    """
    inputs = tokenizer(question, context, return_tensors="pt", max_length=512, truncation=True)
    if verbose:
        print(inputs)

    # Move the inputs to wherever the model's weights currently live.
    device = next(model.parameters()).device
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Perform inference with dropout disabled and without gradient tracking.
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)

    # Most probable start, then the most probable end that does not precede it;
    # an independent argmax could place the end before the start and yield "".
    start_index = torch.argmax(outputs.start_logits)
    end_index = start_index + torch.argmax(outputs.end_logits[0, start_index:])

    input_ids = inputs["input_ids"][0]
    answer = tokenizer.convert_tokens_to_string(
        tokenizer.convert_ids_to_tokens(input_ids[start_index:end_index + 1])
    )

    if verbose:
        print(start_index)
        print(input_ids[start_index])
        print(tokenizer.decode(input_ids[start_index]))
    print("Question:", question)
    print("Answer:", answer)
    return answer


In [None]:
# Tokenize inputs
context = "Gajah Mada (c. 1290 – c. 1364), also known as Jirnnodhara,[3] was a \
          powerful military leader and mahapatih (the approximate equivalent of \
          a modern prime minister) of the Javanese empire of Majapahit during \
          the 14th century. He is credited in Old Javanese manuscripts, poems, \
          and inscriptions with bringing the empire to its peak of glory."
question = "Who was credited in Old Javanese manuscripts?"

In [None]:
inference(question, context)

{'input_ids': tensor([[    2,    72,    23,  4976,    19,   315, 28738, 11182,    60,     3,
          4369, 10671,    13, 17680,    13,     5,   150,     9,   390,  3165,
            13,    10,   272,     9,   539,  3470,     6,    15,    67,   167,
            28,  7022,  5778,   251,    43,  4297,    15,  2558,   240,   500,
            23,    21,  2177,   611,  1156,    17,  6421, 10563,   252,    13,
             5,   124, 16569,  4602,    16,    21,   773,  1621,   789,     6,
            16,    14, 28738,  1563,    16,  7083,  7738, 10242,   112,    14,
           513,    96,   428,     9,    24,    25,  4976,    19,   315, 28738,
         11182,    15,  4840,    15,    17, 14501,    29,  3657,    14,  1563,
            20,    82,  3059,    16,  7916,     9,     3]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1

In [None]:
context = "The Apollo program, also known as Project Apollo, was the third United States human spaceflight program carried out by the National Aeronautics and Space Administration (NASA). The program succeeded in landing the first humans on the Moon from 1969 to 1972. Apollo set major milestones in human spaceflight and space exploration, leading the United States to become the first and only country to have landed humans on the Moon. The first landing happened on July 20, 1969, when Neil Armstrong and Buzz Aldrin, both American astronauts, landed the Apollo Lunar Module Eagle on the Moon's surface."
question = "Who were the first humans to land on the Moon?"

inference(question, context)

{'input_ids': tensor([[    2,    72,    46,    14,    64,  2840,    20,   476,    27,    14,
          2121,    60,     3,    14,  8532,   625,    15,    67,   167,    28,
           669,  8532,    15,    23,    14,   422,   181,   202,   585,   726,
         14750,   625,  1521,    70,    34,    14,   152, 24874,    18,    17,
           726,  1603,    13,     5,  6169,    58,     6,     9,    14,   625,
          2914,    19,  3090,    14,    64,  2840,    27,    14,  2121,    37,
          2533,    20,  2249,     9,  8532,   309,   394, 19104,    18,    19,
           585,   726, 14750,    17,   726,  8284,    15,  1005,    14,   181,
           202,    20,   533,    14,    64,    17,   104,   475,    20,    57,
          4388,  2840,    27,    14,  2121,     9,    14,    64,  3090,  1190,
            27,   313,   434,    15,  2533,    15,    76,  5870,  9158,    17,
          9122,    13,  8100,  2445,    15,   156,   189, 15226,    18,    15,
          4388,    14,  8532, 12710, 1

In [None]:
inference("what is my name?", "my name is chandra")

{'input_ids': tensor([[    2,    98,    25,    51,   204,    60,     3,    51,   204,    25,
         12598,     3]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
tensor(10, device='cuda:0')
tensor(12598, device='cuda:0')
chandra
Question: what is my name?
Answer: chandra


In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
model_save_path = "/content/drive/MyDrive/NLP/fine_tuned_albert"
trainer.model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)

('/content/drive/MyDrive/NLP/fine_tuned_albert/tokenizer_config.json',
 '/content/drive/MyDrive/NLP/fine_tuned_albert/special_tokens_map.json',
 '/content/drive/MyDrive/NLP/fine_tuned_albert/spiece.model',
 '/content/drive/MyDrive/NLP/fine_tuned_albert/added_tokens.json',
 '/content/drive/MyDrive/NLP/fine_tuned_albert/tokenizer.json')

In [None]:
# Build the question-answering pipeline from the checkpoint saved to
# `model_save_path` above. No earlier cell defines `qa_pipeline`, so without
# this a fresh Restart & Run All fails here with NameError.
qa_pipeline = pipeline(
    "question-answering",
    model=model_save_path,
    tokenizer=model_save_path,
)

question = "When did Beyonce start becoming popular?"
context = "Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles 'Crazy in Love' and 'Baby Boy'."

answer = qa_pipeline(question=question, context=context)
print('\n', answer['answer'])


 late 1990s


In [None]:
question = "Charlie's Angels featured which single from the band members?"
context = "The remaining band members recorded Independent Women Part I, which appeared on the soundtrack to the 2000 film, Charlie's Angels. It became their best-charting single, topping the U.S. Billboard Hot 100 chart for eleven consecutive weeks. In early 2001, while Destiny's Child was completing their third album, Beyoncé landed a major role in the MTV made-for-television film, Carmen: A Hip Hopera, starring alongside American actor Mekhi Phifer. Set in Philadelphia, the film is a modern interpretation of the 19th century opera Carmen by French composer Georges Bizet. When the third album Survivor was released in May 2001, Luckett and Roberson filed a lawsuit claiming that the songs were aimed at them. The album debuted at number one on the U.S. Billboard 200, with first-week sales of 663,000 copies sold. The album spawned other number-one hits, Bootylicious and the title track, Survivor, the latter of which earned the group a Grammy Award for Best R&B Performance by a Duo or Group with Vocals. After releasing their holiday album 8 Days of Christmas in October 2001, the group announced a hiatus to further pursue solo careers."

answer=qa_pipeline(question=question,
             context=context)
print('\n',answer['answer'])


 Independent Women Part I,


In [None]:
question = "how do we achieve a healthy body"
context = "an old man once said that the keys to everlasting wealth are health, friends, and family. A healthy body will give you strength, to achieve this one must eat healthily and excercise daily"

answer=qa_pipeline(question=question,
             context=context)
print('\n',answer['answer'])


 eat healthily and excercise


In [None]:
# The model answers incorrectly when the answer is split across several parts of the context

question = "what can we do to get a healthy body"
context = "an old man once said that the keys to everlasting wealth are health, friends, and family. A healthy body will give you strength, to achieve this one must eat healthily and excercise daily. other than that to get a healthy body one can also relax the body with meditation."

answer=qa_pipeline(question=question,
             context=context)
print('\n',answer['answer'])


 relax the body with meditation.


In [None]:
# Able to answer a question about the subject when the question is specific
question = "who said that the keys to everlasting wealth are health, friends, and family"
context = "an old man named george luther spencer the fiftienth once said that the keys to everlasting wealth are health, friends, and family. A healthy body will give you strength, to achieve this one must eat healthily and excercise daily. other than that to get a healthy body one can also relax the body with meditation."

answer=qa_pipeline(question=question,
             context=context)
print('\n',answer['answer'])


 george luther spencer the fiftienth


In [None]:
# Able to answer a question about the subject when the question is non-specific
question = "who said that"
context = "an old man named george luther spencer the fiftienth once said that the keys to everlasting wealth are health, friends, and family. A healthy body will give you strength, to achieve this one must eat healthily and excercise daily. other than that to get a healthy body one can also relax the body with meditation."

answer=qa_pipeline(question=question,
             context=context)
print('\n',answer['answer'])


 george luther spencer the fiftienth


In [None]:
# Unable to answer a yes-or-no question about the subject
question = "is george an old man"
context = "an old man named george luther spencer the fiftienth once said that the keys to everlasting wealth are health, friends, and family. A healthy body will give you strength, to achieve this one must eat healthily and excercise daily. other than that to get a healthy body one can also relax the body with meditation."

answer=qa_pipeline(question=question,
             context=context)
print('\n',answer['answer'])


 an old man named george luther spencer the fiftienth


In [None]:
question = "how old is max"
context = "Max verstappen, a 20 year old formula 1 champion, is highly praised by many fans in his home country 'belgium'"

answer=qa_pipeline(question=question,
             context=context)
print('\n',answer['answer'])


 20


In [None]:
question = "where is max's home country"
context = "Max verstappen, a 20 year old formula 1 champion, is highly praised by many fans in his home country 'belgium'"

answer=qa_pipeline(question=question,
             context=context)
print('\n',answer['answer'])


 'belgium'


In [None]:
question = "who is max's rival"
context = "Max verstappen, a 20 year old formula 1 champion, is highly praised by many fans in his home country 'belgium'. He is praised because of his amazing performances during the formula 1 championship in 2023 where he manages to beat his rival lewis hamilton."

answer=qa_pipeline(question=question,
             context=context)
print('\n',answer['answer'])


 lewis hamilton.


In [None]:
question = "why is max praised"
context = "Max verstappen, a 20 year old formula 1 champion, is highly praised by many fans in his home country 'belgium'. test gap .He is praised because of his amazing performances during the formula 1 championship in 2023 where he manages to beat his rival lewis hamilton."

answer=qa_pipeline(question=question,
             context=context)
print('\n',answer['answer'])


 because of his amazing performances during the formula 1 championship
