In [1]:
import transformers 
import datasets
import torch
from torch.utils.data import Dataset
import logging
from transformers import TrainingArguments, Trainer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
dataset = datasets.load_dataset("quoref")

Found cached dataset quoref (C:/Users/dama_/.cache/huggingface/datasets/quoref/default/0.1.0/82bb58a6b25cd8dbb4625a7ba6a5d0a224af1f4d392ca0de8b9e0c23e78557fe)
100%|██████████| 2/2 [00:00<00:00, 31.22it/s]


In [5]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'question', 'context', 'title', 'url', 'answers'],
        num_rows: 19399
    })
    validation: Dataset({
        features: ['id', 'question', 'context', 'title', 'url', 'answers'],
        num_rows: 2418
    })
})

In [13]:
# Preprocessing
dataset_name = "quoref" 
model_type="roberta"
model_name= "damapika/roberta-base_mod_squad"
models_dir = "saved_models/roberta-base_mod_quoref"
checkpoint = 'roberta-base'
max_input_length = 308


# ## Training
learning_rate = 3e-5
num_epochs = 3

In [4]:
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
model = transformers.AutoModelForQuestionAnswering.from_pretrained(model_name)

In [5]:
# calculate max context length for dataset
def calc_max_len(dataset):
  context_length_max=len(dataset[0]['context'])
  for i in range(len(dataset)):
    con_len=len(dataset[i]['context'])
    if(con_len<context_length_max):
      context_length_max=con_len
      print(context_length_max)
      print(dataset[i]['context'])
  return context_length_max

In [6]:
calc_max_len(dataset['validation'])

1410
Set 19 years after the events of the first film, the movie deals with unresolved conflict and family strain, and also has elements of a coming of age story. Michael Goorjian reprises his role of Heroin Bob, and acts as a narrator for the film, both in voice over as well as sporadically intercut scenes of him in the afterlife
Ross is the child of Trish and Heroin Bob, being conceived shortly before Bob's accidental drug overdose. Ross has been raised by alone by Trish, above her steam punk curio and clothing boutique, and as a result of his mothers adoration of the macabre, as well as his immersion in the concept of death from a young age, he develops into a Victorian Goth. Despite his obvious affiliation, Ross insists that he is part of no social cliques, which is stressed even further when he states that despite a lifelong abstinence from drugs, alcohol, and sex; he is not Straight Edge either. 
Upon having his heart broken by his first girlfriend, Ross attempts to drown his sorr

822

In [106]:
dataset['train'][0]

{'id': 'ba3f052c7a557909526b59713430403dd134e01d',
 'question': 'What is the first name of the person who doubted it would turn out to be a highly explosive eruption like those that can occur in subduction-zone volcanoes?',
 'title': '2007–2008 Nazko earthquakes 1',
 'url': 'https://en.wikipedia.org/wiki/2007%E2%80%932008_Nazko_earthquakes',
 'answers': {'answer_start': [250], 'text': ['Catherine']}}

In [14]:
def preprocess_function(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_input_length ,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0)
        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [15]:
tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=dataset['train'].column_names)

Loading cached processed dataset at C:\Users\dama_\.cache\huggingface\datasets\quoref\default\0.1.0\82bb58a6b25cd8dbb4625a7ba6a5d0a224af1f4d392ca0de8b9e0c23e78557fe\cache-97bdbc22183b6cde.arrow


In [16]:
data_collator = transformers.DefaultDataCollator()

In [10]:
torch.cuda.empty_cache()


In [11]:
# Check if CUDA is available
if torch.cuda.is_available():
    # Set the device to CUDA
    device = torch.device('cuda')
    print('gpu')
else:
    # If CUDA is not available, fall back to CPU
    device = torch.device('cpu')
    print('cpu')

gpu


In [17]:
training_args = TrainingArguments(
    output_dir=models_dir,
    evaluation_strategy="epoch",
    learning_rate=learning_rate,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)

Cloning https://huggingface.co/damapika/roberta-base_mod_quoref into local empty directory.


In [21]:
import wandb
wandb.init() 

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: C:\Users\dama_/.netrc


In [22]:
trainer.train()

 14%|█▎        | 500/3639 [08:19<52:16,  1.00it/s]

[A                                               

{'loss': 1.1823, 'learning_rate': 2.1755976916735367e-05, 'epoch': 0.41}



[A                                                

{'loss': 1.1263, 'learning_rate': 1.763396537510305e-05, 'epoch': 0.82}


                                                 
                                                   

{'eval_loss': 1.266489028930664, 'eval_runtime': 22.7465, 'eval_samples_per_second': 106.302, 'eval_steps_per_second': 6.682, 'epoch': 1.0}



[A                                                

{'loss': 0.8973, 'learning_rate': 1.3511953833470735e-05, 'epoch': 1.24}



[A                                                

{'loss': 0.7404, 'learning_rate': 9.389942291838417e-06, 'epoch': 1.65}


                                                 
                                                   

{'eval_loss': 1.3566540479660034, 'eval_runtime': 23.1232, 'eval_samples_per_second': 104.57, 'eval_steps_per_second': 6.573, 'epoch': 2.0}



[A                                                

{'loss': 0.687, 'learning_rate': 5.267930750206101e-06, 'epoch': 2.06}



[A                                                

{'loss': 0.4962, 'learning_rate': 1.145919208573784e-06, 'epoch': 2.47}



[A                                                

{'loss': 0.5172, 'learning_rate': 0.0, 'epoch': 2.89}


                                                 
                                                   
100%|██████████| 3639/3639 [30:58<00:00,  1.96it/s]

{'eval_loss': 1.5566297769546509, 'eval_runtime': 22.9847, 'eval_samples_per_second': 105.2, 'eval_steps_per_second': 6.613, 'epoch': 3.0}
{'train_runtime': 1858.679, 'train_samples_per_second': 31.311, 'train_steps_per_second': 1.958, 'train_loss': 0.7955441685881252, 'epoch': 3.0}





TrainOutput(global_step=3639, training_loss=0.7955441685881252, metrics={'train_runtime': 1858.679, 'train_samples_per_second': 31.311, 'train_steps_per_second': 1.958, 'train_loss': 0.7955441685881252, 'epoch': 3.0})

In [24]:
trainer.push_to_hub()

Several commits (2) will be pushed upstream.
The progress bars may be unreliable.
To https://huggingface.co/damapika/roberta-base_mod_quoref
   9d91ef2..ec50c72  main -> main



In [None]:
question = "Who is Fyodor Dostoevsky?"
context = "In the world of literature, there have been many authors who have gained a reputation for their ability to create complex characters. One such author is Fyodor Dostoevsky, a Russian novelist who wrote several influential works in the 19th century."

In [None]:
question_answerer = transformers.pipeline("question-answering", model="damapika/roberta-base_mod")
question_answerer(question=question, context=context)