In [9]:
import transformers 
import datasets
import torch
import logging
import json
import pandas as pd
from torch.utils.data import Subset

In [10]:
class MyDataset:
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        row = self.data[index]
        return {
            'id': row['id'],
            'question': row['question'],
            'answer': row['answer'],
            'context': row['context'],
            'answer_start': row['answer_start'],
            'answer_end': row['answer_end']
        }

    def get(self, index):
        return self.__getitem__(index)

    def set(self, index, id=None, question=None, answer=None, context=None, answer_start=None, answer_end=None):
        if id is not None:
            self.data[index]['id'] = id
        if question is not None:
            self.data[index]['question'] = question
        if answer is not None:
            self.data[index]['answer'] = answer
        if context is not None:
            self.data[index]['context'] = context
        if answer_start is not None:
            self.data[index]['answer_start'] = answer_start
        if answer_end is not None:
            self.data[index]['answer_end'] = answer_end

In [43]:
dataset=torch.load("../datasets/ms-marco_train_qa.pt")

In [46]:
dataset=dataset.train_test_split(test_size=0.3)

In [49]:
dataset_train=dataset['train']

In [60]:
dataset_train[0]

{'id': 414389,
 'question': 'is it possible to explain colors',
 'answer': 'Yes, it is possible to explain color to blind person.',
 'context': "Hazel eyes often have a mixture of green, brown and amber hues. Their color can be hard to describe, since ambient lighting and clothing colors can affect your perception of them. [Enlarge] But it turns out the story is more complicated than that. But even in a sea of quirk, one question stood out. Spirit Airlines asks candidates to describe the color yellow to somebody who's blind. On Glassdoor, people chimed in with suggestions on how to answer: Yellow is the warm sun while a cool breeze blows on your face. Yellow is exciting without being loud or angry.. It is a warm, soft color, like a baby chick, or the warmth of sunlight in springtime streaming through a window warming up a patch on the carpet.. The science of color is sometimes called chromatics, colorimetry, or simply color science. It includes the perception of color by the human eye 

In [51]:
dataset=dataset['test'].train_test_split(test_size=0.5)

In [53]:
dataset_valid=dataset['train']

In [59]:
dataset_valid[0]

{'id': 137123,
 'question': 'definition of pika',
 'answer': 'A small mammal, with short limbs, very round body, rounded ears, and no external tail.',
 'context': "A review of the person's eating habits also may be conducted. Before making a diagnosis of pica, the doctor will evaluate the presence of other disorders -- such as mental retardation, developmental disabilities, or obsessive-compulsive disorder -- as the cause of the odd eating behavior. In this article. Pica is the persistent eating of substances such as dirt or paint that have no nutritional value. The Handbook of Clinical Child Psychology currently estimates that prevalence rates of pica range from 4%-26% among institutionalized populations. In the United States, the pika is colloquially called a coney, a nonspecific term also used for rabbits, hares and hyraxes. Pica (/ˈpaɪkə/ PY-kə) is characterized by an appetite for substances that are largely non-nutritive, such as paper, clay, metal, chalk, soil, glass, or sand. A 

In [57]:
dataset_test=dataset['test']

In [58]:
dataset_test[0]

{'id': 682239,
 'question': 'what is a dss?',
 'answer': 'Decision support system is designed to provide effective analysis relevant to specific situations while an MIS management information system is designed for the efficient processing of data or information.',
 'context': "HHS oversees programs and services that improve the well-being of individuals, families, and communities. Home visiting programs improve the health of at-risk children by reaching pregnant women, expectant fathers, and parents and caregivers of children under five. There are many reasons people say no DSS in adverts. One of the most massive is because the landlord of the property is on the sick, and cannot have the DSS, or DWP as it is now known, cross referencing and then realising the landlord is dabbling in benefit fraud. NC Division of Social Services 2401 Mail Service Center Raleigh, NC 27699-2401. Contact Social Services When renting a property what does DSS mean? im looking at prices for rental in cumbria

In [4]:
squad=torch.load("../datasets/squad.pt")

In [14]:
squad['train']

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 87599
})

In [61]:
# calculate max context length for dataset
def calc_max_len(dataset):
  context_length_max=0
  for i in range(len(dataset)):
    con_len=len(dataset[i]['context'])
    if(con_len>context_length_max):
      context_length_max=con_len
  return context_length_max


In [74]:
# Preprocessing
# sep_token = '<sep>'
dataset_name = "ms_marco"
model_type="roberta"
model_name= "roberta-base"
models_dir = "saved_models/roberta-base_ms-marco_mod"
checkpoint = 'roberta-base'
max_input_length = 512


# ## Training
learning_rate = 3e-5
num_epochs = 3

In [65]:
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)

In [66]:
def preprocess_function(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_input_length ,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answer"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = examples["answer_start"][0]
        end_char = examples["answer_end"][0]
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0)
        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [11]:
# Transform your custom dataset to a PyTorch dataset
dataset = datasets.Dataset.from_generator(
    generator=lambda: iter(dataset)
)

Found cached dataset generator (C:/Users/dama_/.cache/huggingface/datasets/generator/default-1493d1b5d27ca61a/0.0.0)


In [12]:
# dataset_val= datasets.Dataset.from_generator(
#     generator=lambda: iter(dataset_valid))

Downloading and preparing dataset generator/default to C:/Users/dama_/.cache/huggingface/datasets/generator/default-1f482045682b315d/0.0.0...


                                                                    

Dataset generator downloaded and prepared to C:/Users/dama_/.cache/huggingface/datasets/generator/default-1f482045682b315d/0.0.0. Subsequent calls will reuse this data.


In [16]:
# torch.save(dataset,"../datasets/ms-marco_train_qa.pt")

In [17]:
# torch.save(dataset_valid,"../datasets/ms-marco_valid_qa.pt")# 

In [51]:
dataset

Dataset({
    features: ['id', 'question', 'answer', 'context', 'answer_start', 'answer_end'],
    num_rows: 55578
})

In [68]:
tokenized_msmarco_train = dataset_train.map(preprocess_function, batched=True, remove_columns=dataset_train.column_names)

                                                                    

In [69]:
tokenized_msmarco_val=dataset_valid.map(preprocess_function, batched=True, remove_columns=dataset_valid.column_names)

                                                                  

In [70]:
data_collator = transformers.DefaultDataCollator()

In [71]:
model = transformers.AutoModelForQuestionAnswering.from_pretrained(model_name)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForQuestionAnswering: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at roberta-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use 

In [75]:
training_args = transformers.TrainingArguments(
    output_dir=models_dir,
    evaluation_strategy="epoch",
    learning_rate=learning_rate,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    push_to_hub=True,
)

trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_msmarco_train,
    eval_dataset=tokenized_msmarco_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

e:\HOGENT\2022_2023\BA\BP_Info_Support\bert\saved_models/roberta-base_ms-marco_mod is already a clone of https://huggingface.co/damapika/roberta-base_ms-marco_mod. Make sure you pull the latest changes with `repo.git_pull()`.


In [76]:
trainer.train()



RuntimeError: The expanded size of the tensor (9173) must match the existing size (514) at non-singleton dimension 1.  Target sizes: [16, 9173].  Tensor sizes: [1, 514]