In [1]:
import transformers 
import datasets
import torch
import logging
import json
import pandas as pd
from torch.utils.data import Subset

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class MyDataset:
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        row = self.data[index]
        return {
            'id': row['id'],
            'question': row['question'],
            'answer': row['answer'],
            'context': row['context'],
            'answer_start': row['answer_start'],
            'answer_end': row['answer_end']
        }

    def get(self, index):
        return self.__getitem__(index)

    def set(self, index, id=None, question=None, answer=None, context=None, answer_start=None, answer_end=None):
        if id is not None:
            self.data[index]['id'] = id
        if question is not None:
            self.data[index]['question'] = question
        if answer is not None:
            self.data[index]['answer'] = answer
        if context is not None:
            self.data[index]['context'] = context
        if answer_start is not None:
            self.data[index]['answer_start'] = answer_start
        if answer_end is not None:
            self.data[index]['answer_end'] = answer_end

In [3]:
dataset=torch.load("../datasets/ms-marco_train_qa.pt")

In [6]:
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'question', 'answer', 'context', 'answer_start', 'answer_end'],
        num_rows: 301763
    })
    test: Dataset({
        features: ['id', 'question', 'answer', 'context', 'answer_start', 'answer_end'],
        num_rows: 201176
    })
})

In [5]:
dataset=dataset.train_test_split(test_size=0.4)

In [7]:
dataset_train=dataset['train']

In [12]:
dataset_train[0]

{'id': 584861,
 'question': 'what causes a rotator cuff tear',
 'answer': 'Bleeding and inflammation.',
 'context': "Print. The rotator cuff is a group of muscles and tendons that surround the shoulder joint, keeping the head of your upper arm bone firmly within the shallow socket of the shoulder. A rotator cuff injury can cause a dull ache in the shoulder, which often worsens when you try to sleep on the involved side.he rotator cuff is a group of muscles and tendons that surround the shoulder joint, keeping the head of your upper arm bone firmly within the shallow socket of the shoulder. The rotator cuff is a group of muscles and tendons that surround the shoulder joint, keeping the head of your upper arm bone firmly within the shallow socket of the shoulder.A rotator cuff injury can cause a dull ache in the shoulder, which often worsens when you try to sleep on the involved side.he rotator cuff is a group of muscles and tendons that surround the shoulder joint, keeping the head of y

In [8]:
dataset=dataset['test'].train_test_split(test_size=0.5)

In [10]:
dataset_valid=dataset['train']

In [11]:
dataset_valid[0]

{'id': 116924,
 'question': 'define  etymological derivation',
 'answer': 'The \u200bstudy of the \u200borigin and \u200bhistory of words.',
 'context': "Online Language Dictionaries. English definition English thesaurus English-Spanish English-French English-Italian Spanish-English French-English Italian-English Espanol-Español Francais-Français Francais-Français Espanol-Español Espanol: español Portugues: português Portugues: português espanol... español espanol español definicion definición espanol español sinonimos sinónimos catala català definicio definició more etymology. n, pl-gies. 1. (Linguistics) the study of the sources and development of words and morphemes. 2. (Linguistics) an account of the source and development of a word or morpheme. [C14: via Latin from Greek etumologia; see etymon, -logy]. etymology noun [C or U]. › the \u200bstudy of the \u200borigin and \u200bhistory of words, or a \u200bstudy of this \u200btype \u200brelating to one \u200bparticular word: At \u200b

In [13]:
dataset_test=dataset['test']

In [18]:
dataset_test

Dataset({
    features: ['id', 'question', 'answer', 'context', 'answer_start', 'answer_end'],
    num_rows: 100588
})

In [4]:
squad=torch.load("../datasets/squad.pt")

In [14]:
squad['train']

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 87599
})

In [23]:
# calculate max context length for dataset
def calc_max_len(dataset):
  context_length_max=len(dataset[0]['context'])
  for i in range(len(dataset)):
    con_len=len(dataset[i]['context'])
    if(con_len<context_length_max):
      context_length_max=con_len
      print(context_length_max)
      print(dataset[i]['context'])
  return context_length_max


In [24]:
calc_max_len(dataset_train)

5362
Relevance. Rating Newest Oldest. Best Answer: SUCROSE is the scientific name of table sugar, is a disaccharide (glucose + fructose) with the molecular formula C12H22O11. Its systematic name is α-D-glucopyranosyl-(1→2)-β-D-fructofuran...It is best known for its role in human nutrition and is formed by plants but not by higher organisms.ts systematic name is α-D-glucopyranosyl-(1→2)-β-D-fructofuran... It is best known for its role in human nutrition and is formed by plants but not by higher organisms. Best Answer: SUCROSE is the scientific name of table sugar, is a disaccharide (glucose + fructose) with the molecular formula C12H22O11.Its systematic name is α-D-glucopyranosyl-(1→2)-β-D-fructofuran... It is best known for its role in human nutrition and is formed by plants but not by higher organisms.ts systematic name is α-D-glucopyranosyl-(1→2)-β-D-fructofuran... It is best known for its role in human nutrition and is formed by plants but not by higher organisms. Answer by Dudeboy3

301

In [25]:
# Preprocessing
# sep_token = '<sep>'
dataset_name = "ms_marco"
model_type="roberta"
model_name= "roberta-base"
models_dir = "saved_models/roberta-base_ms-marco_mod"
checkpoint = 'roberta-base'
max_input_length = 301


# ## Training
learning_rate = 3e-5
num_epochs = 2

In [26]:
tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)

In [45]:
def preprocess_function(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_input_length ,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answer"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = examples["answer_start"]
        print(len(start_char))
        
        end_char = examples["answer_end"]
        print(len(end_char))
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0)
        print(offset[context_start][0])
        print(end_char)
        print(offset[context_end][1])
        print(start_char)
        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [None]:
# Transform your custom dataset to a PyTorch dataset
# dataset = datasets.Dataset.from_generator(
#     generator=lambda: iter(dataset)
# )

In [12]:
# dataset_val= datasets.Dataset.from_generator(
#     generator=lambda: iter(dataset_valid))

Downloading and preparing dataset generator/default to C:/Users/dama_/.cache/huggingface/datasets/generator/default-1f482045682b315d/0.0.0...


                                                                    

Dataset generator downloaded and prepared to C:/Users/dama_/.cache/huggingface/datasets/generator/default-1f482045682b315d/0.0.0. Subsequent calls will reuse this data.


In [16]:
# torch.save(dataset,"../datasets/ms-marco_train_qa.pt")

In [17]:
# torch.save(dataset_valid,"../datasets/ms-marco_valid_qa.pt")# 

In [29]:
dataset_train

Dataset({
    features: ['id', 'question', 'answer', 'context', 'answer_start', 'answer_end'],
    num_rows: 301763
})

In [38]:
dataset_train[0]

{'id': 584861,
 'question': 'what causes a rotator cuff tear',
 'answer': 'Bleeding and inflammation.',
 'context': "Print. The rotator cuff is a group of muscles and tendons that surround the shoulder joint, keeping the head of your upper arm bone firmly within the shallow socket of the shoulder. A rotator cuff injury can cause a dull ache in the shoulder, which often worsens when you try to sleep on the involved side.he rotator cuff is a group of muscles and tendons that surround the shoulder joint, keeping the head of your upper arm bone firmly within the shallow socket of the shoulder. The rotator cuff is a group of muscles and tendons that surround the shoulder joint, keeping the head of your upper arm bone firmly within the shallow socket of the shoulder.A rotator cuff injury can cause a dull ache in the shoulder, which often worsens when you try to sleep on the involved side.he rotator cuff is a group of muscles and tendons that surround the shoulder joint, keeping the head of y

In [46]:
tokenized_msmarco_train = dataset_train.map(preprocess_function, batched=True, remove_columns=dataset_train.column_names)



1000
1000
0
[915, 1116, 806, 523, 1064, 376, 256, 1287, 686, 212, 613, 157, 594, 950, 264, 541, 708, 191, 569, 1109, 641, 904, 658, 1368, 265, 645, 601, 1133, 614, 485, 626, 587, 599, 756, 343, 1096, 643, 472, 542, 424, 466, 538, 598, 1213, 487, 612, 356, 1178, 180, 560, 569, 272, 1196, 589, 249, 522, 986, 347, 500, 913, 733, 791, 631, 587, 602, 937, 270, 711, 397, 691, 299, 854, 635, 495, 517, 377, 537, 512, 603, 1106, 375, 450, 483, 525, 616, 589, 631, 445, 491, 451, 561, 611, 1055, 634, 425, 500, 1172, 998, 509, 644, 1112, 741, 569, 1144, 579, 678, 600, 1081, 484, 805, 744, 476, 659, 504, 294, 371, 776, 792, 1048, 937, 1006, 457, 557, 427, 1145, 693, 904, 552, 521, 350, 1130, 907, 1123, 241, 132, 677, 345, 504, 684, 684, 301, 540, 605, 1010, 260, 1041, 961, 487, 518, 393, 611, 410, 655, 569, 358, 1032, 630, 647, 1286, 1166, 283, 676, 256, 262, 618, 867, 500, 900, 960, 596, 234, 1129, 1118, 510, 264, 592, 310, 308, 690, 251, 296, 297, 575, 900, 522, 562, 559, 1081, 632, 536, 898, 733

TypeError: '>' not supported between instances of 'int' and 'list'

In [69]:
tokenized_msmarco_val=dataset_valid.map(preprocess_function, batched=True, remove_columns=dataset_valid.column_names)

                                                                  

In [70]:
data_collator = transformers.DefaultDataCollator()

In [71]:
model = transformers.AutoModelForQuestionAnswering.from_pretrained(model_name)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForQuestionAnswering: ['lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForQuestionAnswering were not initialized from the model checkpoint at roberta-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use 

In [75]:
training_args = transformers.TrainingArguments(
    output_dir=models_dir,
    evaluation_strategy="epoch",
    learning_rate=learning_rate,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    push_to_hub=True,
)

trainer = transformers.Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_msmarco_train,
    eval_dataset=tokenized_msmarco_val,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

e:\HOGENT\2022_2023\BA\BP_Info_Support\bert\saved_models/roberta-base_ms-marco_mod is already a clone of https://huggingface.co/damapika/roberta-base_ms-marco_mod. Make sure you pull the latest changes with `repo.git_pull()`.


In [76]:
trainer.train()



RuntimeError: The expanded size of the tensor (9173) must match the existing size (514) at non-singleton dimension 1.  Target sizes: [16, 9173].  Tensor sizes: [1, 514]