In [224]:
from datasets import load_dataset, load_metric, ClassLabel
import random
import pandas as pd
import json
from transformers import AutoTokenizer
import os


# File names expected inside every task directory.
train_set, dev_set, info_file = "train.csv", "dev.csv", "info.json"

# Root folder containing the task directories (machine-specific absolute path).
# Alternative: dir_path = os.path.dirname(os.path.abspath(__file__))
dir_path = 'D:/MRC/MRC_CSK_pretraining/dataset'

# Marker strings that the preprocessing code prepends to the input text.
span_token = "madeupword0000"
mc_token = "madeupword0001"
cls_task_token = "madeupword0002"
task_types = ["span", "mc", "cls"]

# Split names used as keys of the loaded dataset dictionary.
train_key, eval_key = "train", "eval"

class Dataset :
    """Base class that loads one task's CSV splits and tokenizes them.

    The constructor reads ``train.csv`` / ``dev.csv`` (via ``load_dataset``)
    and ``info.json`` from ``<dir_path>/<dataroot>/``. Calling the instance
    maps ``preprocess_function`` over both splits in batches.

    Subclasses must implement ``preprocess_function(examples)``.
    """

    def __init__(self, task, dataroot, split, task_type, task_category, task_choices, max_seq_length, tokenizer):
        """Load the raw splits and the task configuration.

        Args:
            task: Task name; subclasses write it into the ``class`` column.
            dataroot: Task directory, relative to the module-level ``dir_path``.
            split: Stored as-is; not used by this class.
            task_type: Stored as-is.
            task_category: Stored as-is.
            task_choices: Stored as-is.
            max_seq_length: Maximum sequence length made available to subclasses.
            tokenizer: Tokenizer made available to subclasses.
        """
        path = dir_path + "/" + dataroot + '/'
        self.datasets = load_dataset('csv', data_files={train_key: path + train_set, eval_key : path + dev_set})
        # JSON files are UTF-8 by specification; do not depend on the platform's
        # default encoding (which differs on Windows).
        with open(path + info_file, encoding="utf-8") as json_file:
            self.configs = json.load(json_file)
        self.task = task
        self.task_type = task_type
        self.task_category = task_category
        self.split = split
        self.max_seq_length = max_seq_length
        self.tokenizer = tokenizer
        self.task_choices = task_choices

    def preprocess_function(self, examples):
        """Tokenize one batch of rows; must be overridden by every subclass.

        Raises:
            NotImplementedError: Always, in the base class.
        """
        raise NotImplementedError(
            f"{type(self).__name__} must implement preprocess_function(examples)"
        )

    def __call__(self):
        """Apply ``preprocess_function`` to every split and return the result.

        The mapped dataset replaces ``self.datasets``, so a second call maps
        the already-processed data again.
        """
        self.datasets = self.datasets.map(self.preprocess_function, batched=True)
        return self.datasets

class CoLADataset(Dataset) :
    """Single-sentence task read from the ``sentence`` column."""

    def __init__(self, task, dataroot, split, task_type, task_category, task_choices, max_seq_length, tokenizer):
        super().__init__(task, dataroot, split, task_type, task_category, task_choices, max_seq_length, tokenizer)

    def preprocess_function(self, examples):
        """Prefix each sentence with ``cls_task_token`` and tokenize the batch.

        The task name is written into ``examples['class']`` on the incoming
        batch; the returned dict holds only the tokenizer outputs.
        """
        prefixed = []
        for text in examples["sentence"]:
            prefixed.append(cls_task_token + text)

        # One task label per row, stored on the batch itself.
        examples['class'] = [self.task for _ in examples["sentence"]]

        encoded = self.tokenizer(prefixed, truncation=True)
        return dict(encoded.items())

class CommonsenseQADataset(Dataset) :
    """Multiple-choice task with one ``question`` and five answer columns."""

    def __init__(self, task, dataroot, split, task_type, task_category, task_choices, max_seq_length, tokenizer):
        super().__init__(task, dataroot, split, task_type, task_category, task_choices, max_seq_length, tokenizer)
        self.choice_names = ["answerA", "answerB", "answerC", "answerD", "answerE"]

    def preprocess_function(self, examples):
        """Tokenize every (question, choice) pair of the batch.

        Each question is prefixed with ``mc_token`` and paired with each of its
        answer choices. The pairs are tokenized as one flat list and regrouped
        so that every tokenizer output holds one list of ``len(choice_names)``
        encodings per row.

        The task name is written into ``examples['class']`` on the incoming
        batch; the returned dict holds only the regrouped tokenizer outputs.
        """
        questions = examples["question"]
        # Derive the group size from the choice columns instead of repeating a
        # literal that has to be kept in sync by hand.
        num_choices = len(self.choice_names)

        # Flat, row-major layout: all choices of row 0, then all of row 1, ...
        first_sentences = []
        for question in questions:
            first_sentences.extend([mc_token + question] * num_choices)

        choice_columns = [examples[name] for name in self.choice_names]
        second_sentences = [str(answer) for row in zip(*choice_columns) for answer in row]

        examples['class'] = [self.task] * len(questions)

        tokenized_examples = self.tokenizer(first_sentences, second_sentences, truncation=True)

        # Un-flatten: regroup consecutive encodings into one list per row.
        return {
            k: [v[i:i + num_choices] for i in range(0, len(v), num_choices)]
            for k, v in tokenized_examples.items()
        }

class SocialIQADataset(Dataset) :
    """Multiple-choice task with one ``Context`` column and three answer columns."""

    def __init__(self, task, dataroot, split, task_type, task_category, task_choices, max_seq_length, tokenizer):
        super().__init__(task, dataroot, split, task_type, task_category, task_choices, max_seq_length, tokenizer)
        self.choice_names = ["answerA", "answerB", "answerC"]

    def preprocess_function(self, examples):
        """Tokenize every (context, choice) pair of the batch.

        Each ``Context`` value is prefixed with ``mc_token`` and paired with
        each of its answer choices. The pairs are tokenized as one flat list
        and regrouped so that every tokenizer output holds one list of
        ``len(choice_names)`` encodings per row.

        The task name is written into ``examples['class']`` on the incoming
        batch; the returned dict holds only the regrouped tokenizer outputs.
        """
        contexts = examples["Context"]
        # Group size follows the choice columns rather than a separate literal.
        num_choices = len(self.choice_names)

        # Flat, row-major layout: all choices of row 0, then all of row 1, ...
        first_sentences = []
        for context in contexts:
            first_sentences.extend([mc_token + context] * num_choices)

        choice_columns = [examples[name] for name in self.choice_names]
        second_sentences = [str(answer) for row in zip(*choice_columns) for answer in row]

        examples['class'] = [self.task] * len(contexts)

        tokenized_examples = self.tokenizer(first_sentences, second_sentences, truncation=True)

        # Un-flatten: regroup consecutive encodings into one list per row.
        return {
            k: [v[i:i + num_choices] for i in range(0, len(v), num_choices)]
            for k, v in tokenized_examples.items()
        }

class MultiRCDataset(Dataset) :
    """Passage/question/answer task; CSV columns: passage, question, answer, label."""

    def __init__(self, task, dataroot, split, task_type, task_category, task_choices, max_seq_length, tokenizer):
        super().__init__(task, dataroot, split, task_type, task_category, task_choices, max_seq_length, tokenizer)

    def preprocess_function(self, examples):
        """Tokenize (question + newline + passage, answer) pairs for the batch.

        The first segment is ``cls_task_token``, the question, a newline and
        the passage; the second segment is the answer converted to ``str``.
        The task name is written into ``examples['class']`` on the incoming
        batch; the returned dict holds only the tokenizer outputs.
        """
        first_segments = []
        for row, passage in enumerate(examples["passage"]):
            question = examples["question"][row]
            first_segments.append(cls_task_token + question + "\n" + passage)

        answer_texts = list(map(str, examples["answer"]))

        # One task label per row, stored on the batch itself.
        examples['class'] = [self.task] * len(examples["question"])

        encoded = self.tokenizer(first_segments, answer_texts, truncation=True)
        return dict(encoded.items())

class SQuadDataset(Dataset) :
    """Span-extraction task encoded as ONE sequence: token + question + newline + context."""

    def __init__(self, task, dataroot, split, task_type, task_category, task_choices, max_seq_length, tokenizer) :
        super().__init__(task, dataroot, split, task_type, task_category, task_choices, max_seq_length, tokenizer)

    def preprocess_function(self, examples) :
        """Tokenize a batch and label each resulting feature with answer token positions.

        Long inputs overflow into several features, so the output may contain
        more entries than the batch has rows. A feature whose window does not
        contain the answer (or whose row has no answer) is labelled with the
        position of the CLS token for both start and end.

        Args:
            examples: Batch with ``question``, ``context`` and ``answers``
                columns. ``answers`` holds the string form of a dict with
                ``text`` and ``answer_start`` lists.

        Returns:
            The tokenizer output with ``start_positions`` and ``end_positions``
            added and the overflow/offset bookkeeping removed.
        """
        import ast  # local import: only needed to parse the serialized answers

        examples["question"] = [q.lstrip() for q in examples["question"]]
        # Text placed in front of each context. Its length is needed below
        # because token offsets refer to the whole concatenated string.
        prefixes = [span_token + question + "\n" for question in examples["question"]]
        context_question = [prefix + context for prefix, context in zip(prefixes, examples["context"])]

        tokenized_examples = self.tokenizer(
            context_question,
            truncation=True,
            max_length=self.max_seq_length,
            return_overflowing_tokens=True,  # split long inputs into several features
            return_offsets_mapping=True,  # character span of every token, for labelling
        )
        sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
        offset_mapping = tokenized_examples.pop("offset_mapping")

        tokenized_examples["start_positions"] = []
        tokenized_examples["end_positions"] = []

        for i, offsets in enumerate(offset_mapping):
            input_ids = tokenized_examples["input_ids"][i]
            cls_index = input_ids.index(self.tokenizer.cls_token_id)

            sequence_ids = tokenized_examples.sequence_ids(i)

            sample_index = sample_mapping[i]
            # The column holds a Python-literal dict; literal_eval parses it
            # without executing arbitrary code the way eval() would.
            answers = ast.literal_eval(examples["answers"][sample_index])

            # If no answers are given, set the cls_index as answer.
            if not answers["answer_start"]:
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
                continue

            # answer_start counts characters from the start of the context,
            # while the offsets count from the start of the concatenated
            # input, so shift by the length of the prefix.
            start_char = len(prefixes[sample_index]) + answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])

            # First real (non-special) token of this feature.
            token_start_index = 0
            while sequence_ids[token_start_index] != 0 :
                token_start_index += 1

            # Last real (non-special) token of this feature.
            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != 0 :
                token_end_index -= 1

            # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).
            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
                continue

            # Otherwise move the token_start_index and token_end_index to the two ends of the answer.
            # Note: we could go after the last offset if the answer is the last word (edge case).
            while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                token_start_index += 1
            tokenized_examples["start_positions"].append(token_start_index - 1)
            while offsets[token_end_index][1] >= end_char:
                token_end_index -= 1
            tokenized_examples["end_positions"].append(token_end_index + 1)

        return tokenized_examples

        

# Maps a task name to the Dataset subclass that preprocesses it.
DatasetFactory = dict([
    ("CoLA", CoLADataset),
    ("MultiRC", MultiRCDataset),
    ("SocialIQA", SocialIQADataset),
    ("CommonsenseQA", CommonsenseQADataset),
    ("Squad1.1", SQuadDataset),
])




In [37]:
from datasets import load_dataset

In [132]:
# Load the SQuAD 1.1 training CSV on its own; the path is relative to the
# current working directory.
dataset = load_dataset('csv', data_files='./mrc/squad1.1/train.csv')

Using custom data configuration default-671bad6838d17224
Reusing dataset csv (C:\Users\user\.cache\huggingface\datasets\csv\default-671bad6838d17224\0.0.0\9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff)


In [None]:
def preprocess_function(self, examples) :
    """Tokenize a batch as single sequences and label answer token positions.

    Stand-alone variant of the SQuAD preprocessing: it reads the module-level
    ``tokenizer``, ``max_seq_length`` and ``span_token``; ``self`` is unused.
    Each input is token + question + newline + context. Long inputs overflow
    into several features. A feature whose window does not contain the answer
    (or whose row has no answer) is labelled with the CLS token position.

    Args:
        self: Ignored.
        examples: Batch with ``question``, ``context`` and ``answers``
            columns. ``answers`` holds the string form of a dict with
            ``text`` and ``answer_start`` lists.

    Returns:
        The tokenizer output with ``start_positions`` and ``end_positions``
        added and the overflow/offset bookkeeping removed.
    """
    import ast  # local import: only needed to parse the serialized answers

    examples["question"] = [q.lstrip() for q in examples["question"]]
    # Text placed in front of each context. Its length is needed below because
    # token offsets refer to the whole concatenated string.
    prefixes = [span_token + question + "\n" for question in examples["question"]]
    context_question = [prefix + context for prefix, context in zip(prefixes, examples["context"])]

    tokenized_examples = tokenizer(
        context_question,
        truncation=True,
        max_length=max_seq_length,
        return_overflowing_tokens=True,  # split long inputs into several features
        return_offsets_mapping=True,  # character span of every token, for labelling
    )
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    offset_mapping = tokenized_examples.pop("offset_mapping")

    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []

    for i, offsets in enumerate(offset_mapping):
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        sequence_ids = tokenized_examples.sequence_ids(i)

        sample_index = sample_mapping[i]
        # The column holds a Python-literal dict; literal_eval parses it
        # without executing arbitrary code the way eval() would.
        answers = ast.literal_eval(examples["answers"][sample_index])

        # If no answers are given, set the cls_index as answer.
        if not answers["answer_start"]:
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
            continue

        # answer_start counts characters from the start of the context, while
        # the offsets count from the start of the concatenated input, so shift
        # by the length of the prefix.
        start_char = len(prefixes[sample_index]) + answers["answer_start"][0]
        end_char = start_char + len(answers["text"][0])

        # First real (non-special) token of this feature.
        token_start_index = 0
        while sequence_ids[token_start_index] != 0 :
            token_start_index += 1

        # Last real (non-special) token of this feature.
        token_end_index = len(input_ids) - 1
        while sequence_ids[token_end_index] != 0 :
            token_end_index -= 1

        # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).
        if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
            continue

        # Otherwise move the token_start_index and token_end_index to the two ends of the answer.
        # Note: we could go after the last offset if the answer is the last word (edge case).
        while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
            token_start_index += 1
        tokenized_examples["start_positions"].append(token_start_index - 1)
        while offsets[token_end_index][1] >= end_char:
            token_end_index -= 1
        tokenized_examples["end_positions"].append(token_end_index + 1)

    return tokenized_examples

In [188]:
from typing import List, Optional
from transformers import RobertaTokenizerFast, RobertaTokenizer, AddedToken

class RobertaMuppetTokenizerFast(RobertaTokenizerFast):
    """RoBERTa fast tokenizer that registers three task-marker special tokens.

    The markers (``span_token``, ``mc_token``, ``cls_task_token``) are added as
    additional special tokens. ``build_inputs_with_special_tokens`` omits the
    leading CLS token when the first sequence already starts with one of them.

    NOTE(review): the notebook output for the sibling ``RobertaMuppetTokenizer``
    still shows id 0 (``<s>``) in front of the marker after calling the
    tokenizer, which suggests the normal ``__call__`` path does not consult
    this method - confirm before relying on it.
    """

    def __init__(
        self,
        vocab_file,
        merges_file,
        errors="replace",
        bos_token="<s>",
        eos_token="</s>",
        sep_token="</s>",
        cls_token="<s>",
        unk_token="<unk>",
        pad_token="<pad>",
        mask_token="<mask>",
        span_token="madeupword0000",
        mc_token="madeupword0001",
        cls_task_token = "madeupword0002",
        add_prefix_space=False,
        **kwargs):
        """Build the tokenizer and register the three task markers.

        Marker arguments given as ``str`` are wrapped in ``AddedToken`` with
        ``lstrip=False, rstrip=False``; other values are used unchanged.
        """
        mrc_token = AddedToken(span_token, lstrip=False, rstrip=False) if isinstance(span_token, str) else span_token
        com_token = AddedToken(mc_token, lstrip=False, rstrip=False) if isinstance(mc_token, str) else mc_token
        cls_task_token = AddedToken(cls_task_token, lstrip=False, rstrip=False) if isinstance(cls_task_token, str) else cls_task_token
        special_tokens_dict  = { "additional_special_tokens" : [mrc_token, com_token, cls_task_token]}

        super().__init__(
            vocab_file=vocab_file,
            merges_file=merges_file,
            errors=errors,
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            sep_token=sep_token,
            cls_token=cls_token,
            pad_token=pad_token,
            mask_token=mask_token,
            cls_task_token=cls_task_token,
            add_prefix_space=add_prefix_space,
            **kwargs,
        )
        self.add_special_tokens(special_tokens_dict)
        #model.resize_token_embeddings(len(tokenizer))

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """Wrap one or two id sequences in RoBERTa's special tokens.

        Args:
            token_ids_0: Ids of the first sequence.
            token_ids_1: Ids of the optional second sequence.

        Returns:
            ``[CLS] A [SEP]`` or ``[CLS] A [SEP][SEP] B [SEP]``. The leading
            CLS is left out when ``token_ids_0`` already starts with one of
            the additional special tokens.
        """
        cls = [self.cls_token_id]
        sep = [self.sep_token_id]
        if len(token_ids_0) > 0 and token_ids_0[0] in self.additional_special_tokens_ids:
            if token_ids_1 is None:
                return token_ids_0 + [self.sep_token_id]
            return token_ids_0 + sep + sep + token_ids_1 + sep

        if token_ids_1 is None:
            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]

        return cls + token_ids_0 + sep + sep + token_ids_1 + sep

    # The method was originally defined under this misspelled name, which the
    # library never calls; keep it as an alias for any existing direct callers.
    build_input_with_special_tokens = build_inputs_with_special_tokens
    

In [201]:
# Instantiate the custom tokenizer class from the "roberta-base" checkpoint.
# The captured output below is the library's warning that the checkpoint was
# saved with a different tokenizer class.
tokenizer = RobertaMuppetTokenizerFast.from_pretrained("roberta-base")

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RobertaTokenizer'. 
The class this function is called from is 'RobertaMuppetTokenizerFast'.


In [226]:
class SQuadDataset(Dataset) :
    """Span-extraction task encoded as a (token + question, context) pair."""

    def __init__(self, task, dataroot, split, task_type, task_category, task_choices, max_seq_length, tokenizer) :
        super().__init__(task, dataroot, split, task_type, task_category, task_choices, max_seq_length, tokenizer)

    def preprocess_function(self, examples) :
        """Tokenize a batch and label each resulting feature with answer token positions.

        The question, prefixed with ``span_token``, is the first sequence and
        the context is the second. Long inputs overflow into several features,
        so the output may contain more entries than the batch has rows. A
        feature whose window does not contain the answer (or whose row has no
        answer) is labelled with the position of the span token.

        Args:
            examples: Batch with ``question``, ``context`` and ``answers``
                columns. ``answers`` holds the string form of a dict with
                ``text`` and ``answer_start`` lists.

        Returns:
            A plain dict of the tokenizer outputs plus ``start_positions`` and
            ``end_positions``.
        """
        import ast  # local import: only needed to parse the serialized answers

        span_token_question = [span_token + question for question in examples["question"]]
        # NOTE(review): truncation=True may also shorten the question;
        # truncation="only_second" would restrict truncation to the context -
        # confirm which is intended.
        tokenized_examples = self.tokenizer(
            span_token_question,
            examples['context'],
            truncation=True,
            max_length=self.max_seq_length,
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
        )
        sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
        offset_mapping = tokenized_examples.pop("offset_mapping")

        # Ask the tokenizer for the marker's id instead of hard-coding it.
        span_token_id = self.tokenizer.convert_tokens_to_ids(span_token)

        tokenized_examples["start_positions"] = []
        tokenized_examples["end_positions"] = []

        for i, offsets in enumerate(offset_mapping):
            input_ids = tokenized_examples["input_ids"][i]
            try:
                span_index = input_ids.index(span_token_id)
            except ValueError:
                # This window does not contain the marker. Use the first
                # position so the label is well defined, rather than reusing
                # the previous window's index or leaving the name unbound.
                span_index = 0
            sequence_ids = tokenized_examples.sequence_ids(i)
            sample_index = sample_mapping[i]
            # The column holds a Python-literal dict; literal_eval parses it
            # without executing arbitrary code the way eval() would.
            answers = ast.literal_eval(examples["answers"][sample_index])

            if not answers["answer_start"]:
                tokenized_examples["start_positions"].append(span_index)
                tokenized_examples["end_positions"].append(span_index)
                continue

            # Character range of the answer inside the context.
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])

            # First token that belongs to the context (sequence id 1).
            token_start_index = 0
            while sequence_ids[token_start_index] != 1 :
                token_start_index += 1

            # Last token that belongs to the context.
            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != 1 :
                token_end_index -= 1

            # Answer not fully inside this window: label with the marker position.
            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                tokenized_examples["start_positions"].append(span_index)
                tokenized_examples["end_positions"].append(span_index)
                continue

            # Move both indices inward to the tokens at the two ends of the answer.
            while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                token_start_index += 1
            tokenized_examples["start_positions"].append(token_start_index - 1)
            while offsets[token_end_index][1] >= end_char:
                token_end_index -= 1
            tokenized_examples["end_positions"].append(token_end_index + 1)

        return {k : v for k, v in tokenized_examples.items()}
    


        

# Lookup table from task name to the Dataset subclass handling that task.
DatasetFactory = {
    task_name: dataset_cls
    for task_name, dataset_cls in (
        ("CoLA", CoLADataset),
        ("MultiRC", MultiRCDataset),
        ("SocialIQA", SocialIQADataset),
        ("CommonsenseQA", CommonsenseQADataset),
        ("Squad1.1", SQuadDataset),
    )
}

class RobertaMuppetTokenizer(RobertaTokenizerFast):
    """RoBERTa fast tokenizer with three task-marker special tokens registered."""

    def __init__(
        self,
        vocab_file,
        merges_file,
        errors="replace",
        bos_token="<s>",
        eos_token="</s>",
        sep_token="</s>",
        cls_token="<s>",
        unk_token="<unk>",
        pad_token="<pad>",
        mask_token="<mask>",
        span_token="madeupword0000",
        mc_token="madeupword0001",
        cls_task_token = "madeupword0002",
        add_prefix_space=False,
        **kwargs):
        """Build the tokenizer, then add the markers as additional special tokens."""

        def _as_added_token(token):
            # Plain strings are wrapped; anything else is passed through as given.
            if isinstance(token, str):
                return AddedToken(token, lstrip=False, rstrip=False)
            return token

        mrc_token = _as_added_token(span_token)
        com_token = _as_added_token(mc_token)
        cls_task_token = _as_added_token(cls_task_token)

        super().__init__(
            vocab_file=vocab_file,
            merges_file=merges_file,
            errors=errors,
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            sep_token=sep_token,
            cls_token=cls_token,
            pad_token=pad_token,
            mask_token=mask_token,
            cls_task_token=cls_task_token,
            add_prefix_space=add_prefix_space,
            **kwargs,
        )
        self.add_special_tokens(
            {"additional_special_tokens": [mrc_token, com_token, cls_task_token]}
        )
        #model.resize_token_embeddings(len(tokenizer))

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """Add separator tokens, and a leading CLS unless a task marker leads.

        Returns ``A [SEP]`` / ``A [SEP][SEP] B [SEP]`` when ``token_ids_0``
        starts with an additional special token, otherwise the same with a
        CLS id in front.
        """
        cls_id = self.cls_token_id
        sep_id = self.sep_token_id

        starts_with_marker = (
            len(token_ids_0) > 0
            and token_ids_0[0] in self.additional_special_tokens_ids
        )
        head = token_ids_0 if starts_with_marker else [cls_id] + token_ids_0

        if token_ids_1 is not None:
            return head + [sep_id] + [sep_id] + token_ids_1 + [sep_id]
        return head + [sep_id]



# --- Experiment configuration: SQuAD 1.1 span extraction ---
task = "Squad1.1"
task_type = 'span'
task_category = 'mrc'
task_choices = None
split = "trainval"
max_seq_length = 1024
# Other dataset folders that can be selected instead:
#   "classification/CoLA", "commonsense/socialIQA", "mrc/multirc"
dataroot = "mrc/squad1.1"

tokenizer = RobertaMuppetTokenizer.from_pretrained('roberta-base')

# Build the task's dataset wrapper and run its preprocessing over all splits.
squad = DatasetFactory[task](
    task=task,
    dataroot=dataroot,
    split=split,
    task_type=task_type,
    task_category=task_category,
    task_choices=task_choices,
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
)
processed = squad()

# Show the first training feature as raw ids and as decoded text.
print(processed['train']['input_ids'][0])
print(tokenizer.decode(processed['train']['input_ids'][0]))
print("hello")


The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RobertaTokenizer'. 
The class this function is called from is 'RobertaMuppetTokenizer'.
Using custom data configuration default-9e7326804edaf423


Downloading and preparing dataset csv/default (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to C:\Users\user\.cache\huggingface\datasets\csv\default-9e7326804edaf423\0.0.0\9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff...


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Dataset csv downloaded and prepared to C:\Users\user\.cache\huggingface\datasets\csv\default-9e7326804edaf423\0.0.0\9144e0a4e8435090117cea53e6c7537173ef2304525df4a077c435d8ee7828ff. Subsequent calls will reuse this data.


HBox(children=(FloatProgress(value=0.0, max=88.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=11.0), HTML(value='')))


[0, 50261, 3972, 2661, 222, 5, 9880, 2708, 2346, 2082, 11, 504, 4432, 11, 226, 2126, 10067, 1470, 116, 2, 2, 37848, 37471, 28108, 6, 5, 334, 34, 10, 4019, 2048, 4, 497, 1517, 5, 4326, 6919, 18, 1637, 31346, 16, 10, 9030, 9577, 9, 5, 9880, 2708, 4, 29261, 11, 760, 9, 5, 4326, 6919, 8, 2114, 24, 6, 16, 10, 7621, 9577, 9, 4845, 19, 3701, 62, 33161, 19, 5, 7875, 22, 39043, 1459, 1614, 1464, 13292, 4977, 845, 4130, 7, 5, 4326, 6919, 16, 5, 26429, 2426, 9, 5, 25095, 6924, 4, 29261, 639, 5, 32394, 2426, 16, 5, 7461, 26187, 6, 10, 19035, 317, 9, 9621, 8, 12456, 4, 85, 16, 10, 24633, 9, 5, 11491, 26187, 23, 226, 2126, 10067, 6, 1470, 147, 5, 9880, 2708, 2851, 13735, 352, 1382, 7, 6130, 6552, 625, 3398, 208, 22895, 853, 1827, 11, 504, 4432, 4, 497, 5, 253, 9, 5, 1049, 1305, 36, 463, 11, 10, 2228, 516, 14, 15230, 149, 155, 19638, 8, 5, 2610, 25336, 238, 16, 10, 2007, 6, 2297, 7326, 9577, 9, 2708, 4, 2]
<s>madeupword0000To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?</s></