In [1]:
import logging
import os
import sys
from dataclasses import dataclass, field
from typing import Optional

import datasets
import nltk  # Here to have a nice missing dependency error message early on
import numpy as np
from datasets import load_dataset, load_metric

import transformers
from filelock import FileLock
from transformers import (
    AutoConfig,
    AutoModelForSeq2SeqLM,
    AutoModelForMaskedLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    HfArgumentParser,
    MBart50Tokenizer,
    MBart50TokenizerFast,
    MBartTokenizer,
    MBartTokenizerFast,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    set_seed,
)
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version, is_offline_mode, send_example_telemetry
from transformers.utils.versions import require_version


from data import DataCollatorForSeq2SeqWithMultipleReferences
from BSF_Trainer import BSFTrainer
from trainer import CustomTrainer

import traceback


# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
# check_min_version("4.21.0.dev0")

# require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt")

logger = logging.getLogger(__name__)

# Check that the NLTK "tokenizers/punkt" resource can be found locally.
try:
    nltk.data.find("tokenizers/punkt")
except (LookupError, OSError):
    # The resource is missing: download it unless offline mode is active.
    if not is_offline_mode():
        # Hold the ".lock" file lock for the duration of the download.
        with FileLock(".lock") as lock:
            nltk.download("punkt", quiet=True)
    else:
        raise LookupError(
            "Offline mode: run this script without TRANSFORMERS_OFFLINE first to download nltk data files"
        )

# Tokenizer classes of multilingual models that require a lang attribute.
MULTILINGUAL_TOKENIZERS = [MBartTokenizer, MBartTokenizerFast, MBart50Tokenizer, MBart50TokenizerFast]

import string
import random


import pandas as pd
import spacy
import random    
import nltk

In [2]:
from datasets import load_from_disk

# Directory of the on-disk wiki MLM dataset. The WIKI_MLM_DATASET_DIR
# environment variable overrides the machine-specific default below, so the
# notebook can run elsewhere without editing this cell.
wiki_mlm_dataset_dir = os.environ.get(
    "WIKI_MLM_DATASET_DIR", "C:/.cache/huggingface/datasets/mlm/wiki_mlm"
)
wiki_datasets_mlm = load_from_disk(wiki_mlm_dataset_dir)

In [3]:
# Load the pretrained config, tokenizer and model.
#
# Distributed training:
# The .from_pretrained methods guarantee that only one local process can concurrently
# download model & vocab.

model_name_or_path = "facebook/bart-large-xsum"

config = AutoConfig.from_pretrained(model_name_or_path)

# The slow (non-fast) tokenizer implementation is requested explicitly.
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=False)

# The checkpoint is loaded through AutoModelForMaskedLM rather than
# AutoModelForSeq2SeqLM. `from_tf` is only enabled when the name contains ".ckpt".
model = AutoModelForMaskedLM.from_pretrained(
    model_name_or_path,
    from_tf=".ckpt" in model_name_or_path,
    config=config,
)

# Extra marker tokens registered with the tokenizer as special tokens.
NER_MASK = "<mask1>"
NER_TOKEN_MASK = "<mask2>"
MLM_CONNECTOR = "<conn1>"
MLM_SGS_CONNECTOR = "<conn2>"
UNMASKED_TOKEN = "<unmasked>"

tokenizer.add_special_tokens(
    {
        "additional_special_tokens": [
            NER_MASK,
            NER_TOKEN_MASK,
            MLM_CONNECTOR,
            MLM_SGS_CONNECTOR,
            UNMASKED_TOKEN,
        ]
    }
)

# Resize the embedding matrix to the enlarged vocabulary; as the cell's last
# expression, the returned embedding module is what gets displayed.
model.resize_token_embeddings(len(tokenizer))


Embedding(50270, 1024)

In [11]:
# Tokenization settings: sequences are truncated to `max_source_length` tokens
# and no padding is applied.
max_source_length = 512
padding = False

# Only the first 100 rows are used: the "text" column provides the inputs and
# the "mlm_label" column the label text.
inputs = wiki_datasets_mlm.select(range(100))["text"]
labels = wiki_datasets_mlm.select(range(100))["mlm_label"]

model_inputs = tokenizer(
    inputs,
    max_length=max_source_length,
    padding=padding,
    truncation=True,
)

# Tokenize the label text inside the tokenizer's target-tokenizer context.
# From here on `labels` holds the tokenizer output instead of the raw strings.
with tokenizer.as_target_tokenizer():
    labels = tokenizer(
        labels,
        max_length=max_source_length,
        padding=padding,
        truncation=True,
    )



'<conn1> <mask1> recognizes achievement for art direction in The categorys original name was <mask1> was changed to its current name in 2012 for <mask1> This change resulted from the Art Directors branch of the Academy of Motion Picture Arts and Sciences AMPAS renamed the Designers branch Since 1947 award is shared with the set decorators It is awarded to the best interior design in a film films below <mask2> <mask2> <mask1> for example <mask1> Award for Best Art Direction is given to film 1999 In the lists below the winner of the award for each year is shown <mask1> by the other nominees in alphabetical Winners <mask2> <mask2> <mask2> <mask1> 1950s <mask1> <mask2> <mask1> <mask2> <mask2> <mask2> <mask2> <mask2> <mask2> Design Critics Choice <mask2> <mask2> <mask2> <mask2> <mask2> <mask2> References <mask2> Design Awards best art'

In [9]:
# Display the token ids of the first tokenized input sequence.
model_inputs["input_ids"][0]

[0,
 50267,
 50265,
 28039,
 7396,
 8312,
 13,
 1808,
 2698,
 11,
 20,
 4120,
 29,
 1461,
 766,
 21,
 50265,
 7325,
 1714,
 7,
 63,
 595,
 766,
 11,
 1125,
 13,
 50265,
 713,
 464,
 4596,
 31,
 5,
 3292,
 12131,
 6084,
 9,
 5,
 3536,
 9,
 16950,
 5311,
 4455,
 8,
 8841,
 3326,
 510,
 2336,
 23469,
 5,
 7438,
 268,
 6084,
 1773,
 21868,
 2354,
 16,
 1373,
 19,
 5,
 278,
 12489,
 3629,
 85,
 16,
 4241,
 7,
 5,
 275,
 6291,
 1521,
 11,
 10,
 822,
 3541,
 874,
 50266,
 50266,
 50265,
 1990,
 1246,
 50265,
 250,
 7767,
 13,
 2700,
 3292,
 29332,
 16,
 576,
 7,
 822,
 6193,
 96,
 5,
 8204,
 874,
 5,
 1924,
 9,
 5,
 2354,
 13,
 349,
 76,
 16,
 2343,
 50265,
 1409,
 5,
 97,
 11357,
 11,
 34555,
 3569,
 22723,
 50266,
 50266,
 50266,
 50265,
 43545,
 29,
 50265,
 50266,
 50265,
 50266,
 50266,
 50266,
 50266,
 50266,
 50266,
 34091,
 11943,
 14431,
 50266,
 50266,
 50266,
 50266,
 50266,
 50266,
 49379,
 50266,
 34091,
 4229,
 275,
 1808,
 2]

In [16]:
# Display the token ids of the first tokenized label sequence.
labels["input_ids"][0]

[0,
 133,
 3536,
 3683,
 13,
 2700,
 9850,
 7438,
 50269,
 50269,
 50269,
 50269,
 50269,
 50269,
 21928,
 50269,
 50269,
 50269,
 50269,
 50269,
 19183,
 3292,
 29332,
 53,
 50269,
 50269,
 50269,
 50269,
 50269,
 50269,
 50269,
 50269,
 50269,
 627,
 5663,
 212,
 3536,
 4229,
 50269,
 50269,
 50269,
 50269,
 50269,
 50269,
 50269,
 50269,
 50269,
 50269,
 50269,
 50269,
 50269,
 50269,
 50269,
 50269,
 50269,
 50269,
 9442,
 50269,
 50269,
 50269,
 50269,
 50269,
 50269,
 627,
 50269,
 50269,
 50269,
 50269,
 50269,
 50269,
 50269,
 50269,
 50269,
 50269,
 50269,
 50269,
 50269,
 50269,
 50269,
 50269,
 50269,
 50269,
 133,
 50269,
 50269,
 1322,
 3147,
 19,
 49,
 931,
 76,
 50269,
 50269,
 627,
 3788,
 3536,
 50269,
 50269,
 50269,
 50269,
 50269,
 50269,
 50269,
 50269,
 102,
 50269,
 7761,
 50269,
 50269,
 50269,
 50269,
 50269,
 50269,
 50269,
 50269,
 50269,
 50269,
 50269,
 50269,
 50269,
 50269,
 50269,
 9502,
 1432,
 50269,
 50269,
 50269,
 50269,
 50269,
 50269,
 10337,
 158

In [20]:
# Preview of the first label sequence with every UNMASKED_TOKEN id replaced by
# -100 (presumably so the loss ignores those positions -- TODO confirm against
# the training loss configuration).
# The id is looked up once: calling tokenizer.get_vocab() inside the
# comprehension would repeat the lookup for every single token.
unmasked_token_id = tokenizer.get_vocab()[UNMASKED_TOKEN]
[-100 if token == unmasked_token_id else token for token in labels["input_ids"][0]]

[0,
 133,
 3536,
 3683,
 13,
 2700,
 9850,
 7438,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 21928,
 -100,
 -100,
 -100,
 -100,
 -100,
 19183,
 3292,
 29332,
 53,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 627,
 5663,
 212,
 3536,
 4229,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 9442,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 627,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 133,
 -100,
 -100,
 1322,
 3147,
 19,
 49,
 931,
 76,
 -100,
 -100,
 627,
 3788,
 3536,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 102,
 -100,
 7761,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 9502,
 1432,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 10337,
 1582,
 462,
 11649,
 -100,
 463,
 11357,
 18283,
 29,
 15891,
 29,
 16863,
 29,
 -100,
 41355,
 29,

In [19]:
# Display the id of the tokenizer's padding token (recorded output: 1).
tokenizer.pad_token_id

1

In [25]:
# Replace every UNMASKED_TOKEN id in all label sequences with -100.
# The id is resolved once up front; the previous form called
# tokenizer.get_vocab() for every token of every sequence.
unmasked_token_id = tokenizer.get_vocab()[UNMASKED_TOKEN]
labels["input_ids"] = [
    [(token_id if token_id != unmasked_token_id else -100) for token_id in label]
    for label in labels["input_ids"]
]

In [26]:
# Id the tokenizer assigned to the UNMASKED_TOKEN special token.
UNMASKED_TOKEN_ID = tokenizer.get_vocab()[UNMASKED_TOKEN]

# Rewrite each label sequence: positions holding UNMASKED_TOKEN_ID become -100,
# all other ids are kept unchanged.
labels["input_ids"] = [
    [-100 if token_id == UNMASKED_TOKEN_ID else token_id for token_id in sequence]
    for sequence in labels["input_ids"]
]

In [27]:
# Display the label ids of all sequences after the -100 replacement.
# NOTE(review): this prints every sequence in full; consider showing a slice.
labels["input_ids"]

[[0,
  133,
  3536,
  3683,
  13,
  2700,
  9850,
  7438,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  21928,
  -100,
  -100,
  -100,
  -100,
  -100,
  19183,
  3292,
  29332,
  53,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  627,
  5663,
  212,
  3536,
  4229,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  9442,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  627,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  133,
  -100,
  -100,
  1322,
  3147,
  19,
  49,
  931,
  76,
  -100,
  -100,
  627,
  3788,
  3536,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  102,
  -100,
  7761,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  -100,
  9502,
  1432,
  -100,
  -100,
  -100,
  

In [7]:
# Hold out 5% of the rows as a test split. A fixed seed makes the random split
# reproducible across kernel restarts; without it each run yields a different
# partition.
raw = wiki_datasets_mlm.train_test_split(test_size=0.05, seed=42)

In [8]:
# Register the held-out "test" split under the "validation" key as well, so
# both keys refer to the same data.
raw["validation"] = raw["test"]

In [9]:
# Display the splits (train / test / validation) with their features and row counts.
raw

DatasetDict({
    train: Dataset({
        features: ['id', 'url', 'title', 'text', 'ner', 'summary', 'mlm_label'],
        num_rows: 3579397
    })
    test: Dataset({
        features: ['id', 'url', 'title', 'text', 'ner', 'summary', 'mlm_label'],
        num_rows: 188390
    })
    validation: Dataset({
        features: ['id', 'url', 'title', 'text', 'ner', 'summary', 'mlm_label'],
        num_rows: 188390
    })
})

In [3]:
# Select the rows from index 3072000 up to the end of the dataset
# (recorded output: 695787 rows).
wiki_datasets_mlm.select(range(3072000, len(wiki_datasets_mlm)))

Dataset({
    features: ['id', 'url', 'title', 'text', 'ner', 'summary', 'mlm_label'],
    num_rows: 695787
})

In [4]:
 # Number of rows from index 3072000 onwards; matches the row count of the selection above.
 len(wiki_datasets_mlm) - 3072000
    

695787

In [6]:
# Preview of the first example's text with "<conn1>" prepended.
# NOTE(review): the recorded output starts with "<conn1> <conn1>", i.e. the
# stored text already begins with the connector -- confirm whether prepending
# it again is intended.
" ".join(["<conn1>", wiki_datasets_mlm[0]["text"]])

'<conn1> <conn1> <mask1> recognizes achievement for art direction in The categorys original name was <mask1> was changed to its current name in 2012 for <mask1> This change resulted from the Art Directors branch of the Academy of Motion Picture Arts and Sciences AMPAS renamed the Designers branch Since 1947 award is shared with the set decorators It is awarded to the best interior design in a film films below <mask2> <mask2> <mask1> for example <mask1> Award for Best Art Direction is given to film 1999 In the lists below the winner of the award for each year is shown <mask1> by the other nominees in alphabetical Winners <mask2> <mask2> <mask2> <mask1> 1950s <mask1> <mask2> <mask1> <mask2> <mask2> <mask2> <mask2> <mask2> <mask2> Design Critics Choice <mask2> <mask2> <mask2> <mask2> <mask2> <mask2> References <mask2> Design Awards best art'