In [1]:
import transformers 
import datasets
import torch
from torch.utils.data import Dataset
import logging
from transformers import TrainingArguments, Trainer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
dataset = datasets.load_dataset("quoref")


Found cached dataset quoref (C:/Users/dama_/.cache/huggingface/datasets/quoref/default/0.1.0/82bb58a6b25cd8dbb4625a7ba6a5d0a224af1f4d392ca0de8b9e0c23e78557fe)
100%|██████████| 2/2 [00:00<00:00, 334.73it/s]


In [69]:
class FilteredDataset(Dataset):
    def __init__(self, dataset):
        self.dataset = dataset
        self.filtered_indices = self._get_filtered_indices()

    def _get_filtered_indices(self):
        filtered_indices = []
        for i in range(len(self.dataset)):
            item = self.dataset[i]
            if len(item['answers']['answer_start']) > 0:
                filtered_indices.append(i)
        return filtered_indices

    def __getitem__(self, index):
        original_index = self.filtered_indices[index]
        return self.dataset[original_index]

    def __len__(self):
        return len(self.filtered_indices)

In [11]:
# Preprocessing
# sep_token = '<sep>'
dataset_name = "adversarial_qa"
model_type="roberta"
model_name= "roberta-base"
models_dir = "saved_models/roberta-base_mod_quoref"
checkpoint = 'roberta-base'
max_input_length = 308


# ## Training
learning_rate = 3e-5
num_epochs = 3

In [4]:
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
model = transformers.AutoModelForQuestionAnswering.from_pretrained(model_name)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForQuestionAnswering: ['lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at roberta-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use 

In [97]:
# calculate max context length for dataset
def calc_max_len(dataset):
  context_length_max=len(dataset[0]['context'])
  for i in range(len(dataset)):
    con_len=len(dataset[i]['context'])
    if(con_len<context_length_max):
      context_length_max=con_len
      print(context_length_max)
      print(dataset[i]['context'])
  return context_length_max

In [100]:
calc_max_len(dataset['train'])

1321
In 1919, the Chicago White Sox are considered one of the greatest baseball teams ever assembled; however, the team's stingy owner, Charles Comiskey, gives little inclination to reward his players for a spectacular season.
Gamblers "Sleepy" Bill Burns and Billy Maharg get wind of the players' discontent, asking shady player Chick Gandil to convince a select group of Sox—including star knuckleball pitcher Eddie Cicotte, who led the majors with a 29–7 win–loss record and an earned run average of 1.82—that they could earn more money by playing badly and throwing the series than they could earn by winning the World Series against the Cincinnati Reds . Cicotte was motivated because Comiskey refused him a promised $10,000 should he win 30 games for the season. Cicotte was nearing the milestone until Comiskey ordered team manager Kid Gleason to bench him for 2 weeks (missing 5 starts) with the excuse that the 35-year-old veteran's arm needed a rest before the series.
A number of players, 

308

In [106]:
dataset['train'][0]

{'id': 'ba3f052c7a557909526b59713430403dd134e01d',
 'question': 'What is the first name of the person who doubted it would turn out to be a highly explosive eruption like those that can occur in subduction-zone volcanoes?',
 'title': '2007–2008 Nazko earthquakes 1',
 'url': 'https://en.wikipedia.org/wiki/2007%E2%80%932008_Nazko_earthquakes',
 'answers': {'answer_start': [250], 'text': ['Catherine']}}

In [5]:
def preprocess_function(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_input_length ,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0)
        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [6]:
tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=dataset['train'].column_names)

Loading cached processed dataset at C:\Users\dama_\.cache\huggingface\datasets\quoref\default\0.1.0\82bb58a6b25cd8dbb4625a7ba6a5d0a224af1f4d392ca0de8b9e0c23e78557fe\cache-9b64e73348f9143b.arrow
                                                                 

In [7]:
data_collator = transformers.DefaultDataCollator()

In [114]:
torch.cuda.empty_cache()


In [118]:
# Check if CUDA is available
if torch.cuda.is_available():
    # Set the device to CUDA
    device = torch.device('cuda')
    print('gpu')
else:
    # If CUDA is not available, fall back to CPU
    device = torch.device('cpu')
    print('cpu')

gpu


In [8]:
training_args = TrainingArguments(
    output_dir=models_dir,
    evaluation_strategy="epoch",
    learning_rate=learning_rate,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

Cloning https://huggingface.co/damapika/roberta-base_mod into local empty directory.
Download file pytorch_model.bin:   0%|          | 24.5k/473M [00:01<8:10:10, 16.9kB/s]
[A

[A[A


[A[A[A



[A[A[A[A




Download file pytorch_model.bin:  99%|█████████▉| 468M/473M [00:20<00:00, 26.2MB/s]   





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





Download file pytorch_model.bin: 100%|██████████| 473M/473M [00:30<00:00, 26.2MB/s]





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





[A[A[A[A[A[A





Download file pytorch_model.bin: 100%|██████████| 473M/473M [00:40<00:00, 12.4MB/s]
Download file training_args.bin: 100%|██████████| 3.56k/3.56k [00:39<?, ?B/s]

[A
Down

In [9]:
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mdamapika[0m. Use [1m`wandb login --relogin`[0m to force relogin


 14%|█▎        | 500/3639 [16:03<1:17:02,  1.47s/it] 

{'loss': 2.3474, 'learning_rate': 2.5877988458367684e-05, 'epoch': 0.41}


 27%|██▋       | 1000/3639 [29:37<1:05:27,  1.49s/it]

{'loss': 1.6272, 'learning_rate': 2.1755976916735367e-05, 'epoch': 0.82}


 33%|███▎      | 1213/3639 [35:27<50:08,  1.24s/it]  
 33%|███▎      | 1213/3639 [37:05<50:08,  1.24s/it]

{'eval_loss': 1.4654324054718018, 'eval_runtime': 98.187, 'eval_samples_per_second': 24.626, 'eval_steps_per_second': 1.548, 'epoch': 1.0}


 41%|████      | 1500/3639 [45:11<56:59,  1.60s/it]   

{'loss': 1.2536, 'learning_rate': 1.763396537510305e-05, 'epoch': 1.24}


 55%|█████▍    | 2000/3639 [1:00:21<59:34,  2.18s/it]  

{'loss': 1.0583, 'learning_rate': 1.3511953833470735e-05, 'epoch': 1.65}


 67%|██████▋   | 2426/3639 [1:28:11<27:00,  1.34s/it]    
 67%|██████▋   | 2426/3639 [1:29:38<27:00,  1.34s/it]

{'eval_loss': 1.4134212732315063, 'eval_runtime': 87.0611, 'eval_samples_per_second': 27.774, 'eval_steps_per_second': 1.746, 'epoch': 2.0}


 69%|██████▊   | 2500/3639 [1:31:42<30:34,  1.61s/it]  

{'loss': 0.9832, 'learning_rate': 9.389942291838417e-06, 'epoch': 2.06}


 82%|████████▏ | 3000/3639 [1:46:13<17:00,  1.60s/it]  

{'loss': 0.7188, 'learning_rate': 5.267930750206101e-06, 'epoch': 2.47}


 96%|█████████▌| 3500/3639 [2:00:05<03:42,  1.60s/it]  

{'loss': 0.6854, 'learning_rate': 1.145919208573784e-06, 'epoch': 2.89}


100%|██████████| 3639/3639 [2:04:13<00:00,  1.35s/it]
100%|██████████| 3639/3639 [2:05:28<00:00,  2.07s/it]

{'eval_loss': 1.5400279760360718, 'eval_runtime': 75.6131, 'eval_samples_per_second': 31.979, 'eval_steps_per_second': 2.01, 'epoch': 3.0}
{'train_runtime': 7531.7793, 'train_samples_per_second': 7.727, 'train_steps_per_second': 0.483, 'train_loss': 1.2186447462232337, 'epoch': 3.0}





TrainOutput(global_step=3639, training_loss=1.2186447462232337, metrics={'train_runtime': 7531.7793, 'train_samples_per_second': 7.727, 'train_steps_per_second': 0.483, 'train_loss': 1.2186447462232337, 'epoch': 3.0})

In [10]:
trainer.push_to_hub()

Upload file pytorch_model.bin: 488MB [00:45, 13.5MB/s]                            To https://huggingface.co/damapika/roberta-base_mod
   498b557..185f8b2  main -> main

Upload file pytorch_model.bin: 100%|██████████| 473M/473M [00:46<00:00, 10.8MB/s]
Upload file runs/May18_11-49-05_Damapika/events.out.tfevents.1684403400.Damapika.30672.0: 100%|██████████| 6.18k/6.18k [00:46<00:00, 137B/s] 
To https://huggingface.co/damapika/roberta-base_mod
   185f8b2..ba9f726  main -> main



'https://huggingface.co/damapika/roberta-base_mod/commit/185f8b2a6bf3c574dd723f8ea4303663196f482f'

In [None]:
question = "Who is Fyodor Dostoevsky?"
context = "In the world of literature, there have been many authors who have gained a reputation for their ability to create complex characters. One such author is Fyodor Dostoevsky, a Russian novelist who wrote several influential works in the 19th century."

In [None]:
question_answerer = transformers.pipeline("question-answering", model="damapika/roberta-base_mod")
question_answerer(question=question, context=context)