In [1]:
import transformers 
import datasets
import torch
import logging

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Preprocessing
# sep_token = '<sep>'
dataset_name = "squad"
model_type="electra"
model_name= "google/electra-base-discriminator"
models_dir = "saved_models/electra-base-discriminator_squad_mod"
checkpoint = 'electra'
max_input_length = 386


# ## Training
learning_rate = 3e-5
num_epochs = 3

In [3]:
dataset=torch.load("../datasets/squad.pt")

In [4]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})

In [4]:
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)

In [5]:
def preprocess_function(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_input_length ,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0)
        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [6]:
tokenized_squad = dataset.map(preprocess_function, batched=True, remove_columns=dataset["train"].column_names)

Loading cached processed dataset at C:\Users\dama_\.cache\huggingface\datasets\squad\plain_text\1.0.0\d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453\cache-87877fe25e986fb6.arrow
Loading cached processed dataset at C:\Users\dama_\.cache\huggingface\datasets\squad\plain_text\1.0.0\d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453\cache-51f0916044623eae.arrow


In [7]:
data_collator = transformers.DefaultDataCollator()

In [8]:
model = transformers.AutoModelForQuestionAnswering.from_pretrained(model_name)

Some weights of the model checkpoint at google/electra-base-discriminator were not used when initializing ElectraForQuestionAnswering: ['discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias']
- This IS expected if you are initializing ElectraForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForQuestionAnswering were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['qa_outputs.weight', 'qa_outputs.

In [9]:
training_args = transformers.TrainingArguments(
    output_dir=models_dir,
    evaluation_strategy="epoch",
    learning_rate=learning_rate,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    push_to_hub=True,
)

trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_squad["train"],
    eval_dataset=tokenized_squad["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

e:\HOGENT\2022_2023\BA\BP_Info_Support\bert\saved_models/electra-base-discriminator_squad_mod is already a clone of https://huggingface.co/damapika/electra-base-discriminator_squad_mod. Make sure you pull the latest changes with `repo.git_pull()`.


In [10]:
torch.cuda.memory_summary(device=None, abbreviated=False)
torch.cuda.empty_cache()

In [11]:
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mdamapika[0m. Use [1m`wandb login --relogin`[0m to force relogin


  2%|▏         | 500/32850 [17:54<19:34:07,  2.18s/it]

{'loss': 2.1018, 'learning_rate': 2.954337899543379e-05, 'epoch': 0.05}


  3%|▎         | 1000/32850 [36:03<19:19:25,  2.18s/it]

{'loss': 1.2541, 'learning_rate': 2.908675799086758e-05, 'epoch': 0.09}


  5%|▍         | 1500/32850 [51:16<17:28:41,  2.01s/it]

{'loss': 1.1761, 'learning_rate': 2.863013698630137e-05, 'epoch': 0.14}


  6%|▌         | 2000/32850 [1:08:27<17:11:37,  2.01s/it]

{'loss': 1.11, 'learning_rate': 2.8173515981735162e-05, 'epoch': 0.18}


  8%|▊         | 2500/32850 [1:23:33<14:41:25,  1.74s/it]

{'loss': 1.0704, 'learning_rate': 2.771689497716895e-05, 'epoch': 0.23}


  9%|▉         | 3000/32850 [1:38:26<14:24:45,  1.74s/it]

{'loss': 1.0632, 'learning_rate': 2.726027397260274e-05, 'epoch': 0.27}


 11%|█         | 3500/32850 [1:53:19<14:10:56,  1.74s/it]

{'loss': 0.9968, 'learning_rate': 2.680365296803653e-05, 'epoch': 0.32}


 12%|█▏        | 4000/32850 [2:08:12<13:56:41,  1.74s/it]

{'loss': 1.0032, 'learning_rate': 2.634703196347032e-05, 'epoch': 0.37}


 14%|█▎        | 4500/32850 [2:17:02<3:59:03,  1.98it/s] 

{'loss': 0.9867, 'learning_rate': 2.589041095890411e-05, 'epoch': 0.41}


 15%|█▌        | 5000/32850 [2:21:39<3:56:02,  1.97it/s] 

{'loss': 0.9362, 'learning_rate': 2.54337899543379e-05, 'epoch': 0.46}


 17%|█▋        | 5500/32850 [2:26:16<3:52:22,  1.96it/s] 

{'loss': 0.9576, 'learning_rate': 2.497716894977169e-05, 'epoch': 0.5}


 18%|█▊        | 6000/32850 [2:30:54<3:46:53,  1.97it/s] 

{'loss': 0.9167, 'learning_rate': 2.452054794520548e-05, 'epoch': 0.55}


 20%|█▉        | 6500/32850 [2:35:31<3:43:17,  1.97it/s] 

{'loss': 0.9153, 'learning_rate': 2.406392694063927e-05, 'epoch': 0.59}


 21%|██▏       | 7000/32850 [2:40:08<3:38:20,  1.97it/s] 

{'loss': 0.8892, 'learning_rate': 2.360730593607306e-05, 'epoch': 0.64}


 23%|██▎       | 7500/32850 [2:44:51<10:28:48,  1.49s/it]

{'loss': 0.9154, 'learning_rate': 2.315068493150685e-05, 'epoch': 0.68}


 24%|██▍       | 8000/32850 [2:59:28<11:48:09,  1.71s/it]

{'loss': 0.8877, 'learning_rate': 2.2694063926940642e-05, 'epoch': 0.73}


 26%|██▌       | 8500/32850 [3:14:06<11:34:58,  1.71s/it]

{'loss': 0.8764, 'learning_rate': 2.223744292237443e-05, 'epoch': 0.78}


 27%|██▋       | 9000/32850 [3:28:43<11:18:47,  1.71s/it]

{'loss': 0.8878, 'learning_rate': 2.178082191780822e-05, 'epoch': 0.82}


 29%|██▉       | 9500/32850 [3:43:21<11:05:09,  1.71s/it]

{'loss': 0.8917, 'learning_rate': 2.132420091324201e-05, 'epoch': 0.87}


 30%|███       | 10000/32850 [3:57:59<10:50:13,  1.71s/it]

{'loss': 0.8854, 'learning_rate': 2.08675799086758e-05, 'epoch': 0.91}


 32%|███▏      | 10500/32850 [4:03:03<3:08:25,  1.98it/s] 

{'loss': 0.8766, 'learning_rate': 2.041095890410959e-05, 'epoch': 0.96}


                                                          
 33%|███▎      | 10950/32850 [4:10:36<3:02:39,  2.00it/s]

{'eval_loss': 0.865917980670929, 'eval_runtime': 201.3066, 'eval_samples_per_second': 52.507, 'eval_steps_per_second': 6.567, 'epoch': 1.0}


 33%|███▎      | 11000/32850 [4:11:01<3:05:17,  1.97it/s]  

{'loss': 0.7999, 'learning_rate': 1.995433789954338e-05, 'epoch': 1.0}


 35%|███▌      | 11500/32850 [4:15:40<3:01:22,  1.96it/s] 

{'loss': 0.6057, 'learning_rate': 1.949771689497717e-05, 'epoch': 1.05}


 37%|███▋      | 12000/32850 [4:20:18<2:55:56,  1.98it/s] 

{'loss': 0.5973, 'learning_rate': 1.904109589041096e-05, 'epoch': 1.1}


 38%|███▊      | 12500/32850 [4:24:56<2:52:19,  1.97it/s] 

{'loss': 0.6338, 'learning_rate': 1.858447488584475e-05, 'epoch': 1.14}


 40%|███▉      | 13000/32850 [4:30:27<9:29:42,  1.72s/it] 

{'loss': 0.6296, 'learning_rate': 1.812785388127854e-05, 'epoch': 1.19}


 41%|████      | 13500/32850 [4:45:05<9:14:35,  1.72s/it] 

{'loss': 0.654, 'learning_rate': 1.767123287671233e-05, 'epoch': 1.23}


 43%|████▎     | 14000/32850 [4:59:49<9:00:09,  1.72s/it] 

{'loss': 0.6223, 'learning_rate': 1.721461187214612e-05, 'epoch': 1.28}


 44%|████▍     | 14500/32850 [5:14:32<8:46:14,  1.72s/it] 

{'loss': 0.6237, 'learning_rate': 1.675799086757991e-05, 'epoch': 1.32}


 46%|████▌     | 15000/32850 [5:29:16<8:31:15,  1.72s/it] 

{'loss': 0.6538, 'learning_rate': 1.63013698630137e-05, 'epoch': 1.37}


Several commits (2) will be pushed upstream.
 47%|████▋     | 15500/32850 [5:44:00<8:17:08,  1.72s/it] 

{'loss': 0.6284, 'learning_rate': 1.584474885844749e-05, 'epoch': 1.42}


Several commits (3) will be pushed upstream.
 49%|████▊     | 16000/32850 [5:58:44<8:03:24,  1.72s/it] 

{'loss': 0.5972, 'learning_rate': 1.538812785388128e-05, 'epoch': 1.46}


Several commits (4) will be pushed upstream.
 50%|█████     | 16500/32850 [6:13:28<7:49:38,  1.72s/it] 

{'loss': 0.63, 'learning_rate': 1.4931506849315068e-05, 'epoch': 1.51}


Several commits (5) will be pushed upstream.
 52%|█████▏    | 17000/32850 [6:28:12<7:34:38,  1.72s/it] 

{'loss': 0.6499, 'learning_rate': 1.4474885844748858e-05, 'epoch': 1.55}


Several commits (6) will be pushed upstream.
 53%|█████▎    | 17500/32850 [6:42:56<7:20:58,  1.72s/it] 

{'loss': 0.6084, 'learning_rate': 1.4018264840182649e-05, 'epoch': 1.6}


Several commits (7) will be pushed upstream.
 55%|█████▍    | 18000/32850 [6:57:42<7:06:41,  1.72s/it] 

{'loss': 0.6028, 'learning_rate': 1.3561643835616437e-05, 'epoch': 1.64}


Several commits (8) will be pushed upstream.
 56%|█████▋    | 18500/32850 [7:12:26<6:51:20,  1.72s/it] 

{'loss': 0.5988, 'learning_rate': 1.3105022831050228e-05, 'epoch': 1.69}


Several commits (9) will be pushed upstream.
 58%|█████▊    | 19000/32850 [7:27:09<6:37:44,  1.72s/it] 

{'loss': 0.5994, 'learning_rate': 1.2648401826484018e-05, 'epoch': 1.74}


Several commits (10) will be pushed upstream.
 59%|█████▉    | 19500/32850 [7:41:53<6:23:14,  1.72s/it] 

{'loss': 0.5989, 'learning_rate': 1.2191780821917808e-05, 'epoch': 1.78}


Several commits (11) will be pushed upstream.
 61%|██████    | 20000/32850 [7:56:37<6:08:20,  1.72s/it] 

{'loss': 0.6069, 'learning_rate': 1.1735159817351598e-05, 'epoch': 1.83}


Several commits (12) will be pushed upstream.
 62%|██████▏   | 20500/32850 [8:11:09<5:50:46,  1.70s/it] 

{'loss': 0.633, 'learning_rate': 1.1278538812785387e-05, 'epoch': 1.87}


Several commits (13) will be pushed upstream.
 64%|██████▍   | 21000/32850 [8:25:46<5:36:10,  1.70s/it] 

{'loss': 0.5917, 'learning_rate': 1.0821917808219177e-05, 'epoch': 1.92}


Several commits (14) will be pushed upstream.
 65%|██████▌   | 21500/32850 [8:40:23<5:23:06,  1.71s/it] 

{'loss': 0.6171, 'learning_rate': 1.0365296803652968e-05, 'epoch': 1.96}


Several commits (15) will be pushed upstream.
                                                          
 67%|██████▋   | 21900/32850 [9:03:05<5:00:55,  1.65s/it]

{'eval_loss': 0.8855888247489929, 'eval_runtime': 655.445, 'eval_samples_per_second': 16.126, 'eval_steps_per_second': 2.017, 'epoch': 2.0}


 67%|██████▋   | 22000/32850 [9:05:56<5:08:47,  1.71s/it]   

{'loss': 0.5755, 'learning_rate': 9.908675799086758e-06, 'epoch': 2.01}


Several commits (16) will be pushed upstream.
 68%|██████▊   | 22500/32850 [9:20:32<4:54:28,  1.71s/it] 

{'loss': 0.4378, 'learning_rate': 9.452054794520548e-06, 'epoch': 2.05}


Several commits (17) will be pushed upstream.
 70%|███████   | 23000/32850 [9:35:08<4:40:25,  1.71s/it] 

{'loss': 0.4269, 'learning_rate': 8.995433789954338e-06, 'epoch': 2.1}


Several commits (18) will be pushed upstream.
 72%|███████▏  | 23500/32850 [9:49:45<4:25:52,  1.71s/it] 

{'loss': 0.3906, 'learning_rate': 8.538812785388127e-06, 'epoch': 2.15}


Several commits (19) will be pushed upstream.
 73%|███████▎  | 24000/32850 [10:04:22<4:11:46,  1.71s/it]

{'loss': 0.4234, 'learning_rate': 8.082191780821917e-06, 'epoch': 2.19}


Several commits (20) will be pushed upstream.
 75%|███████▍  | 24500/32850 [10:18:59<3:57:40,  1.71s/it] 

{'loss': 0.4117, 'learning_rate': 7.6255707762557076e-06, 'epoch': 2.24}


Several commits (21) will be pushed upstream.
 76%|███████▌  | 25000/32850 [10:33:34<3:43:27,  1.71s/it] 

{'loss': 0.4117, 'learning_rate': 7.168949771689498e-06, 'epoch': 2.28}


Several commits (22) will be pushed upstream.
 78%|███████▊  | 25500/32850 [10:48:11<3:29:16,  1.71s/it] 

{'loss': 0.4114, 'learning_rate': 6.712328767123287e-06, 'epoch': 2.33}


Several commits (23) will be pushed upstream.
 79%|███████▉  | 26000/32850 [11:02:48<3:14:54,  1.71s/it] 

{'loss': 0.3956, 'learning_rate': 6.2557077625570776e-06, 'epoch': 2.37}


Several commits (24) will be pushed upstream.
 81%|████████  | 26500/32850 [11:17:25<3:00:33,  1.71s/it] 

{'loss': 0.4057, 'learning_rate': 5.799086757990867e-06, 'epoch': 2.42}


Several commits (25) will be pushed upstream.
 82%|████████▏ | 27000/32850 [11:32:02<2:46:17,  1.71s/it] 

{'loss': 0.4133, 'learning_rate': 5.342465753424657e-06, 'epoch': 2.47}


Several commits (26) will be pushed upstream.
 84%|████████▎ | 27500/32850 [11:46:38<2:32:23,  1.71s/it] 

{'loss': 0.4239, 'learning_rate': 4.8858447488584476e-06, 'epoch': 2.51}


Several commits (27) will be pushed upstream.
 85%|████████▌ | 28000/32850 [12:01:15<2:18:10,  1.71s/it] 

{'loss': 0.4033, 'learning_rate': 4.429223744292237e-06, 'epoch': 2.56}


Several commits (28) will be pushed upstream.
 87%|████████▋ | 28500/32850 [12:15:46<2:03:50,  1.71s/it] 

{'loss': 0.4098, 'learning_rate': 3.972602739726027e-06, 'epoch': 2.6}


Several commits (29) will be pushed upstream.
 88%|████████▊ | 29000/32850 [12:30:22<1:49:29,  1.71s/it] 

{'loss': 0.4003, 'learning_rate': 3.515981735159817e-06, 'epoch': 2.65}


Several commits (30) will be pushed upstream.
 90%|████████▉ | 29500/32850 [12:44:59<1:35:20,  1.71s/it]

{'loss': 0.3817, 'learning_rate': 3.0593607305936074e-06, 'epoch': 2.69}


Several commits (31) will be pushed upstream.
 91%|█████████▏| 30000/32850 [12:59:35<1:21:12,  1.71s/it]

{'loss': 0.3995, 'learning_rate': 2.6027397260273973e-06, 'epoch': 2.74}


Several commits (32) will be pushed upstream.
 93%|█████████▎| 30500/32850 [13:14:12<1:06:45,  1.70s/it]

{'loss': 0.404, 'learning_rate': 2.146118721461187e-06, 'epoch': 2.79}


Several commits (33) will be pushed upstream.
 94%|█████████▍| 31000/32850 [13:28:49<52:39,  1.71s/it]  

{'loss': 0.3801, 'learning_rate': 1.6894977168949772e-06, 'epoch': 2.83}


Several commits (34) will be pushed upstream.
 96%|█████████▌| 31500/32850 [13:43:25<38:21,  1.70s/it]  

{'loss': 0.4002, 'learning_rate': 1.232876712328767e-06, 'epoch': 2.88}


Several commits (35) will be pushed upstream.
 97%|█████████▋| 32000/32850 [13:58:03<24:08,  1.70s/it]  

{'loss': 0.3924, 'learning_rate': 7.76255707762557e-07, 'epoch': 2.92}


Several commits (36) will be pushed upstream.
 99%|█████████▉| 32500/32850 [14:12:40<09:57,  1.71s/it]  

{'loss': 0.4249, 'learning_rate': 3.19634703196347e-07, 'epoch': 2.97}


Several commits (37) will be pushed upstream.
                                                        
100%|██████████| 32850/32850 [14:33:57<00:00,  1.60s/it]

{'eval_loss': 1.0307191610336304, 'eval_runtime': 655.4424, 'eval_samples_per_second': 16.127, 'eval_steps_per_second': 2.017, 'epoch': 3.0}
{'train_runtime': 52440.3864, 'train_samples_per_second': 5.011, 'train_steps_per_second': 0.626, 'train_loss': 0.6815786130025506, 'epoch': 3.0}





TrainOutput(global_step=32850, training_loss=0.6815786130025506, metrics={'train_runtime': 52440.3864, 'train_samples_per_second': 5.011, 'train_steps_per_second': 0.626, 'train_loss': 0.6815786130025506, 'epoch': 3.0})

In [18]:
trainer.push_to_hub()

Several commits (39) will be pushed upstream.
The progress bars may be unreliable.
Upload file pytorch_model.bin: 427MB [22:08, 11.5MB/s]                            To https://huggingface.co/damapika/electra-base-discriminator_squad_mod
   53f9db0..a45c6dc  main -> main

Upload file pytorch_model.bin: 100%|██████████| 415M/415M [22:10<00:00, 327kB/s] 
Upload file runs/May01_14-59-48_Damapika/events.out.tfevents.1682945996.Damapika.26664.0: 100%|██████████| 15.3k/15.3k [02:19<00:00, 113B/s]  


In [19]:
question = "Who is Fyodor Dostoevsky?"
context = "In the world of literature, there have been many authors who have gained a reputation for their ability to create complex characters. One such author is Fyodor Dostoevsky, a Russian novelist who wrote several influential works in the 19th century."

In [20]:
question_answerer = transformers.pipeline("question-answering", model="damapika/electra-base-discriminator_squad_mod")
question_answerer(question=question, context=context)

{'score': 0.9061723351478577,
 'start': 172,
 'end': 190,
 'answer': 'a Russian novelist'}