In [1]:
import transformers 
import datasets
import torch
import logging

  from .autonotebook import tqdm as notebook_tqdm


In [2]:

# squad = load_dataset("squad", split="train[:5000]")

# squad = datasets.load_dataset("squad")

Found cached dataset squad (C:/Users/dama_/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)
100%|██████████| 2/2 [00:00<00:00, 199.96it/s]


In [2]:
squad=torch.load("../datasets/squad.pt")

In [3]:
squad

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})

In [33]:
# squad = squad.train_test_split(test_size=0.2)

In [4]:
# Preprocessing
# sep_token = '<sep>'
dataset_name = "squad"
model_type="bert"
model_name= "distilbert-base-uncased"
models_dir = "saved_models/distilbert-base-uncased_mod"
checkpoint = 'distilbert-base-uncased'
max_input_length = 384

# ## Training
learning_rate = 3e-5


In [4]:
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)

In [5]:
def preprocess_function(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_input_length ,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0)
        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [6]:
tokenized_squad = squad.map(preprocess_function, batched=True, remove_columns=squad["train"].column_names)

Loading cached processed dataset at C:\Users\dama_\.cache\huggingface\datasets\squad\plain_text\1.0.0\d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453\cache-c6979b8502e8782b.arrow
Loading cached processed dataset at C:\Users\dama_\.cache\huggingface\datasets\squad\plain_text\1.0.0\d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453\cache-5208e4cd58802a97.arrow


In [7]:
data_collator = transformers.DefaultDataCollator()

In [8]:
model = transformers.AutoModelForQuestionAnswering.from_pretrained(model_name)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForQuestionAnswering: ['vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this mode

In [9]:
training_args = transformers.TrainingArguments(
    output_dir=models_dir,
    evaluation_strategy="epoch",
    learning_rate=learning_rate,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=True,
)

trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_squad["train"],
    eval_dataset=tokenized_squad["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)



e:\HOGENT\2022_2023\BA\BP_Info_Support\bert\saved_models/distilbert-base-uncased_mod is already a clone of https://huggingface.co/damapika/distilbert-base-uncased_mod. Make sure you pull the latest changes with `repo.git_pull()`.


In [10]:
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mdamapika[0m. Use [1m`wandb login --relogin`[0m to force relogin


  3%|▎         | 500/16425 [03:58<2:05:34,  2.11it/s]

{'loss': 2.721, 'learning_rate': 2.908675799086758e-05, 'epoch': 0.09}


  6%|▌         | 1000/16425 [08:10<2:02:15,  2.10it/s]

{'loss': 1.6817, 'learning_rate': 2.8173515981735162e-05, 'epoch': 0.18}


  9%|▉         | 1500/16425 [12:24<1:58:22,  2.10it/s] 

{'loss': 1.5186, 'learning_rate': 2.726027397260274e-05, 'epoch': 0.27}


 12%|█▏        | 2000/16425 [16:39<1:54:22,  2.10it/s] 

{'loss': 1.4001, 'learning_rate': 2.634703196347032e-05, 'epoch': 0.37}


 15%|█▌        | 2500/16425 [20:53<1:50:25,  2.10it/s] 

{'loss': 1.3652, 'learning_rate': 2.54337899543379e-05, 'epoch': 0.46}


 18%|█▊        | 3000/16425 [25:07<1:46:03,  2.11it/s] 

{'loss': 1.293, 'learning_rate': 2.452054794520548e-05, 'epoch': 0.55}


 21%|██▏       | 3500/16425 [29:21<1:41:55,  2.11it/s] 

{'loss': 1.2543, 'learning_rate': 2.360730593607306e-05, 'epoch': 0.64}


 24%|██▍       | 4000/16425 [33:30<1:34:52,  2.18it/s] 

{'loss': 1.2099, 'learning_rate': 2.2694063926940642e-05, 'epoch': 0.73}


 27%|██▋       | 4500/16425 [37:37<1:31:15,  2.18it/s] 

{'loss': 1.2103, 'learning_rate': 2.178082191780822e-05, 'epoch': 0.82}


 30%|███       | 5000/16425 [41:43<1:27:20,  2.18it/s] 

{'loss': 1.1907, 'learning_rate': 2.08675799086758e-05, 'epoch': 0.91}


                                                       
 33%|███▎      | 5475/16425 [47:12<1:23:02,  2.20it/s]

{'eval_loss': 1.1237562894821167, 'eval_runtime': 95.3037, 'eval_samples_per_second': 110.909, 'eval_steps_per_second': 6.936, 'epoch': 1.0}


 33%|███▎      | 5500/16425 [47:24<1:24:56,  2.14it/s] 

{'loss': 1.1315, 'learning_rate': 1.995433789954338e-05, 'epoch': 1.0}


 37%|███▋      | 6000/16425 [51:30<1:19:50,  2.18it/s] 

{'loss': 0.8868, 'learning_rate': 1.904109589041096e-05, 'epoch': 1.1}


 40%|███▉      | 6500/16425 [55:36<1:15:53,  2.18it/s] 

{'loss': 0.9029, 'learning_rate': 1.812785388127854e-05, 'epoch': 1.19}


 43%|████▎     | 7000/16425 [59:42<1:12:05,  2.18it/s] 

{'loss': 0.9179, 'learning_rate': 1.721461187214612e-05, 'epoch': 1.28}


 46%|████▌     | 7500/16425 [1:03:47<1:08:09,  2.18it/s]

{'loss': 0.907, 'learning_rate': 1.63013698630137e-05, 'epoch': 1.37}


 49%|████▊     | 8000/16425 [1:07:53<1:04:23,  2.18it/s] 

{'loss': 0.8809, 'learning_rate': 1.538812785388128e-05, 'epoch': 1.46}


 52%|█████▏    | 8500/16425 [1:11:59<1:00:32,  2.18it/s] 

{'loss': 0.8829, 'learning_rate': 1.4474885844748858e-05, 'epoch': 1.55}


 55%|█████▍    | 9000/16425 [1:16:05<56:43,  2.18it/s]   

{'loss': 0.8647, 'learning_rate': 1.3561643835616437e-05, 'epoch': 1.64}


 58%|█████▊    | 9500/16425 [1:20:11<52:56,  2.18it/s]   

{'loss': 0.8544, 'learning_rate': 1.2648401826484018e-05, 'epoch': 1.74}


 61%|██████    | 10000/16425 [1:24:17<48:59,  2.19it/s]  

{'loss': 0.8579, 'learning_rate': 1.1735159817351598e-05, 'epoch': 1.83}


 64%|██████▍   | 10500/16425 [1:28:23<45:18,  2.18it/s]  

{'loss': 0.8831, 'learning_rate': 1.0821917808219177e-05, 'epoch': 1.92}


                                                         
 67%|██████▋   | 10950/16425 [1:33:41<41:27,  2.20it/s]

{'eval_loss': 1.0881441831588745, 'eval_runtime': 94.9508, 'eval_samples_per_second': 111.321, 'eval_steps_per_second': 6.962, 'epoch': 2.0}


 67%|██████▋   | 11000/16425 [1:34:04<41:27,  2.18it/s]   

{'loss': 0.8412, 'learning_rate': 9.908675799086758e-06, 'epoch': 2.01}


 70%|███████   | 11500/16425 [1:38:09<37:37,  2.18it/s]  

{'loss': 0.6592, 'learning_rate': 8.995433789954338e-06, 'epoch': 2.1}


 73%|███████▎  | 12000/16425 [1:42:15<33:49,  2.18it/s]  

{'loss': 0.6359, 'learning_rate': 8.082191780821917e-06, 'epoch': 2.19}


 76%|███████▌  | 12500/16425 [1:46:21<30:01,  2.18it/s]  

{'loss': 0.6299, 'learning_rate': 7.168949771689498e-06, 'epoch': 2.28}


 79%|███████▉  | 13000/16425 [1:50:27<26:09,  2.18it/s]  

{'loss': 0.6299, 'learning_rate': 6.2557077625570776e-06, 'epoch': 2.37}


 82%|████████▏ | 13500/16425 [1:54:33<22:22,  2.18it/s]  

{'loss': 0.6472, 'learning_rate': 5.342465753424657e-06, 'epoch': 2.47}


 85%|████████▌ | 14000/16425 [1:58:39<18:28,  2.19it/s]  

{'loss': 0.6387, 'learning_rate': 4.429223744292237e-06, 'epoch': 2.56}


 88%|████████▊ | 14500/16425 [2:02:44<14:42,  2.18it/s]  

{'loss': 0.6303, 'learning_rate': 3.515981735159817e-06, 'epoch': 2.65}


 91%|█████████▏| 15000/16425 [2:06:50<10:53,  2.18it/s]  

{'loss': 0.6326, 'learning_rate': 2.6027397260273973e-06, 'epoch': 2.74}


 94%|█████████▍| 15500/16425 [2:10:56<07:03,  2.18it/s]  

{'loss': 0.6274, 'learning_rate': 1.6894977168949772e-06, 'epoch': 2.83}


 97%|█████████▋| 16000/16425 [2:15:02<03:15,  2.18it/s]  

{'loss': 0.6164, 'learning_rate': 7.76255707762557e-07, 'epoch': 2.92}


                                                       
100%|██████████| 16425/16425 [2:20:08<00:00,  1.95it/s]

{'eval_loss': 1.1685752868652344, 'eval_runtime': 94.783, 'eval_samples_per_second': 111.518, 'eval_steps_per_second': 6.974, 'epoch': 3.0}
{'train_runtime': 8410.9413, 'train_samples_per_second': 31.245, 'train_steps_per_second': 1.953, 'train_loss': 0.9905400338601122, 'epoch': 3.0}





TrainOutput(global_step=16425, training_loss=0.9905400338601122, metrics={'train_runtime': 8410.9413, 'train_samples_per_second': 31.245, 'train_steps_per_second': 1.953, 'train_loss': 0.9905400338601122, 'epoch': 3.0})

In [11]:
trainer.push_to_hub()

Upload file pytorch_model.bin: 259MB [00:39, 7.26MB/s]                            To https://huggingface.co/damapika/distilbert-base-uncased_mod
   46e0351..1a9dcfe  main -> main

Upload file pytorch_model.bin: 100%|██████████| 253M/253M [00:40<00:00, 6.63MB/s]
Upload file runs/Apr21_22-30-03_Damapika/events.out.tfevents.1682109011.Damapika.33640.0: 100%|██████████| 9.89k/9.89k [00:40<00:00, 253B/s] 
To https://huggingface.co/damapika/distilbert-base-uncased_mod
   1a9dcfe..f9707b1  main -> main



'https://huggingface.co/damapika/distilbert-base-uncased_mod/commit/1a9dcfe661042d1eb582247b0b8cc042eb868153'

In [7]:
question = "Who is Fyodor Dostoevsky?"
context = "In the world of literature, there have been many authors who have gained a reputation for their ability to create complex characters. One such author is Fyodor Dostoevsky, a Russian novelist who wrote several influential works in the 19th century."

In [8]:
question_answerer = transformers.pipeline("question-answering", model="damapika/distilbert-base-uncased_mod")
question_answerer(question=question, context=context)

{'score': 0.6729146242141724,
 'start': 172,
 'end': 190,
 'answer': 'a Russian novelist'}

In [24]:
#optional  manually replicate the results of the pipeline 

tokenizer = transformers.AutoTokenizer.from_pretrained("damapika/distilbert-base-uncased_mod")
inputs = tokenizer(question, context, return_tensors="pt")

In [25]:


model = transformers.AutoModelForQuestionAnswering.from_pretrained("damapika/distilbert-base-uncased_mod")
with torch.no_grad():
    outputs = model(**inputs)

In [26]:
answer_start_index = outputs.start_logits.argmax()
answer_end_index = outputs.end_logits.argmax()

In [27]:
predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
tokenizer.decode(predict_answer_tokens)

'a russian novelist'