In [27]:
import transformers
import datasets
import torch
import logging
from transformers import TrainingArguments, Trainer

In [13]:
# dataset = datasets.load_dataset("adversarial_qa","adversarialQA")
dataset = datasets.load_dataset("quoref")

Found cached dataset quoref (C:/Users/dama_/.cache/huggingface/datasets/quoref/default/0.1.0/82bb58a6b25cd8dbb4625a7ba6a5d0a224af1f4d392ca0de8b9e0c23e78557fe)
100%|██████████| 2/2 [00:00<00:00, 285.74it/s]


In [4]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers', 'metadata'],
        num_rows: 30000
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers', 'metadata'],
        num_rows: 3000
    })
    test: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers', 'metadata'],
        num_rows: 3000
    })
})

In [28]:
dataset_name = "quoref"
model_type="bert"
model_name= "distilbert-base-uncased"
models_dir = "saved_models/distilbert-base-uncased_mod"
checkpoint = 'damapika/distilbert-base-uncased_mod'
max_input_length = 308

# ## Training
learning_rate = 3e-5
num_epochs = 3

In [21]:
tokenizer = transformers.AutoTokenizer.from_pretrained(checkpoint)
model = transformers.AutoModelForQuestionAnswering.from_pretrained(checkpoint)

In [22]:
# calculate max context length for dataset
def calc_max_len(dataset):
  context_length_max=len(dataset[0]['context'])
  for i in range(len(dataset)):
    con_len=len(dataset[i]['context'])
    if(con_len<context_length_max):
      context_length_max=con_len
      print(context_length_max)
      print(dataset[i]['context'])
  return context_length_max

In [19]:
calc_max_len(dataset['train'])

1321
In 1919, the Chicago White Sox are considered one of the greatest baseball teams ever assembled; however, the team's stingy owner, Charles Comiskey, gives little inclination to reward his players for a spectacular season.
Gamblers "Sleepy" Bill Burns and Billy Maharg get wind of the players' discontent, asking shady player Chick Gandil to convince a select group of Sox—including star knuckleball pitcher Eddie Cicotte, who led the majors with a 29–7 win–loss record and an earned run average of 1.82—that they could earn more money by playing badly and throwing the series than they could earn by winning the World Series against the Cincinnati Reds . Cicotte was motivated because Comiskey refused him a promised $10,000 should he win 30 games for the season. Cicotte was nearing the milestone until Comiskey ordered team manager Kid Gleason to bench him for 2 weeks (missing 5 starts) with the excuse that the 35-year-old veteran's arm needed a rest before the series.
A number of players, 

308

In [23]:
def preprocess_function(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_input_length ,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0)
        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [24]:
tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=dataset['train'].column_names)

                                                                   

In [25]:
data_collator = transformers.DefaultDataCollator()

In [26]:
torch.cuda.empty_cache()


In [29]:
training_args = TrainingArguments(
    output_dir=models_dir,
    evaluation_strategy="epoch",
    learning_rate=learning_rate,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

Cloning https://huggingface.co/damapika/distilbert-base-uncased_mod into local empty directory.
Download file pytorch_model.bin:   0%|          | 8.74k/253M [00:00<?, ?B/s]
[A

[A[A


[A[A[A



[A[A[A[A




[A[A[A[A[A





Download file pytorch_model.bin:   1%|          | 1.28M/253M [00:01<03:21, 1.31MB/s]






[A[A[A[A[A[A[A







[A[A[A[A[A[A[A[A








[A[A[A[A[A[A[A[A[A









[A[A[A[A[A[A[A[A[A[A










[A[A[A[A[A[A[A[A[A[A[A











[A[A[A[A[A[A[A[A[A[A[A[A












Download file pytorch_model.bin:  92%|█████████▏| 232M/253M [00:10<00:00, 26.7MB/s] 













[A[A[A[A[A[A[A[A[A[A[A[A[A[A













[A[A[A[A[A[A[A[A[A[A[A[A[A[A













[A[A[A[A[A[A[A[A[A[A[A[A[A[A













[A[A[A[A[A[A[A[A[A[A[A[A[A[A













[A[A[A[A[A[A[A[A[A[A[A[A[A[A













[A[A[A[A[A[A[A[A[A[A[A[A[A[A













[A

In [30]:
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mdamapika[0m. Use [1m`wandb login --relogin`[0m to force relogin


 14%|█▎        | 500/3639 [12:11<1:16:49,  1.47s/it]

{'loss': 2.079, 'learning_rate': 2.5877988458367684e-05, 'epoch': 0.41}


 27%|██▋       | 1000/3639 [22:57<17:50,  2.46it/s] 

{'loss': 1.6873, 'learning_rate': 2.1755976916735367e-05, 'epoch': 0.82}


 33%|███▎      | 1213/3639 [28:28<50:22,  1.25s/it]  
 33%|███▎      | 1213/3639 [29:35<50:22,  1.25s/it]

{'eval_loss': 1.6968690156936646, 'eval_runtime': 66.7982, 'eval_samples_per_second': 36.199, 'eval_steps_per_second': 2.276, 'epoch': 1.0}


 41%|████      | 1500/3639 [36:35<52:01,  1.46s/it]   

{'loss': 1.3805, 'learning_rate': 1.763396537510305e-05, 'epoch': 1.24}


 55%|█████▍    | 2000/3639 [49:00<39:51,  1.46s/it]  

{'loss': 1.1652, 'learning_rate': 1.3511953833470735e-05, 'epoch': 1.65}


 67%|██████▋   | 2426/3639 [59:37<25:06,  1.24s/it]  
 67%|██████▋   | 2426/3639 [1:00:44<25:06,  1.24s/it]

{'eval_loss': 1.8044509887695312, 'eval_runtime': 66.6835, 'eval_samples_per_second': 36.261, 'eval_steps_per_second': 2.279, 'epoch': 2.0}


 69%|██████▊   | 2500/3639 [1:02:23<27:21,  1.44s/it]  

{'loss': 1.0974, 'learning_rate': 9.389942291838417e-06, 'epoch': 2.06}


 82%|████████▏ | 3000/3639 [1:10:34<08:14,  1.29it/s]  

{'loss': 0.8123, 'learning_rate': 5.267930750206101e-06, 'epoch': 2.47}


 96%|█████████▌| 3500/3639 [1:23:04<03:25,  1.48s/it]  

{'loss': 0.7953, 'learning_rate': 1.145919208573784e-06, 'epoch': 2.89}


100%|██████████| 3639/3639 [1:25:39<00:00,  1.24s/it]
100%|██████████| 3639/3639 [1:26:44<00:00,  1.43s/it]

{'eval_loss': 2.014676570892334, 'eval_runtime': 65.1354, 'eval_samples_per_second': 37.123, 'eval_steps_per_second': 2.334, 'epoch': 3.0}
{'train_runtime': 5207.9968, 'train_samples_per_second': 11.175, 'train_steps_per_second': 0.699, 'train_loss': 1.2709116923936812, 'epoch': 3.0}





TrainOutput(global_step=3639, training_loss=1.2709116923936812, metrics={'train_runtime': 5207.9968, 'train_samples_per_second': 11.175, 'train_steps_per_second': 0.699, 'train_loss': 1.2709116923936812, 'epoch': 3.0})

In [31]:
trainer.push_to_hub()

Upload file pytorch_model.bin: 259MB [00:25, 11.6MB/s]                            To https://huggingface.co/damapika/distilbert-base-uncased_mod
   4e3fe88..b9acdcb  main -> main

Upload file pytorch_model.bin: 100%|██████████| 253M/253M [00:27<00:00, 9.82MB/s]
Upload file runs/May19_13-33-03_Damapika/events.out.tfevents.1684496021.Damapika.4888.0: 100%|██████████| 6.14k/6.14k [00:27<00:00, 233B/s]  
To https://huggingface.co/damapika/distilbert-base-uncased_mod
   b9acdcb..e6e6b85  main -> main



'https://huggingface.co/damapika/distilbert-base-uncased_mod/commit/b9acdcb7417223d5cb3bfe46ad279ee4c89a87a5'