In [1]:
import logging
import os
import sys
from dataclasses import dataclass, field
from typing import Optional

import datasets
import nltk  # Here to have a nice missing dependency error message early on
import numpy as np
from datasets import load_dataset, load_metric

import transformers
from filelock import FileLock
from transformers import (
    AutoConfig,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    HfArgumentParser,
    MBart50Tokenizer,
    MBart50TokenizerFast,
    MBartTokenizer,
    MBartTokenizerFast,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    set_seed,
)
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version, is_offline_mode, send_example_telemetry
from transformers.utils.versions import require_version


from data import DataCollatorForSeq2SeqWithMultipleReferences
from BSF_Trainer import BSFTrainer
from trainer import CustomTrainer

import traceback


# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
# check_min_version("4.21.0.dev0")

# require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt")

logger = logging.getLogger(__name__)

try:
    nltk.data.find("tokenizers/punkt")
except (LookupError, OSError):
    if is_offline_mode():
        raise LookupError(
            "Offline mode: run this script without TRANSFORMERS_OFFLINE first to download nltk data files"
        )
    with FileLock(".lock") as lock:
        nltk.download("punkt", quiet=True)

# A list of all multilingual tokenizer which require lang attribute.
MULTILINGUAL_TOKENIZERS = [MBartTokenizer, MBartTokenizerFast, MBart50Tokenizer, MBart50TokenizerFast]

import string
import random


import pandas as pd
import spacy
import random    
import nltk

In [2]:
from datasets import load_from_disk

wiki_datasets = load_from_disk("C:\.cache\huggingface\datasets\wiki_ner_100_1000")

In [9]:
wiki_datasets2 = load_from_disk("G:\.cache\huggingface\datasets\wiki_ner_100_1000")

In [14]:
wiki_datasets

Dataset({
    features: ['id', 'url', 'title', 'text', 'ner'],
    num_rows: 3767787
})

In [16]:
wiki_datasets[3700000]

{'id': '68504122',
 'url': 'https://en.wikipedia.org/wiki/Church%20of%20St.%20Peter%20and%20Paul%2C%20Topolje',
 'title': 'Church of St. Peter and Paul, Topolje',
 'text': 'The Church of St. Peter and Paul (, , ) in Topolje is a Roman Catholic church in Baranya and Podunavlje region in eastern Croatia. The church was built in 1722 at the site where Prince Eugene of Savoy defeated Kara Mustafa Pasha on 12 August 1687. (Note that the year may be inaccurate, as Kara Mustafa was executed shortly after the Battle of Vienna in 1683.) It is the oldest preserved church in Baranja. Today, the church is a cultural monument of the 1st category and is one of the most important baroque sacral buildings in Croatia. The village of Topolje is the seat of the parish of St. Peter and Paul  founded in 1247 and reestablished in 1775, which today is a part of the Roman Catholic Archdiocese of Đakovo-Osijek. The church is known in the region after the mystic story about three failed attempts to build a chur

In [3]:
def pre_process_wiki(num_proc = 1, selections = 100, batch_size = 1000):
    import string
    import random


    import pandas as pd
    import random    
    import nltk
    import math
    
    import evaluate
    import numpy as np
    
    from datasets import load_dataset, load_metric
    data_args = {
        "ner_mlm": False,
        "ner_sgs_mlm": True,
        "ner_mlm_prob": 0.6
    }

    NER_MASK = "<mask1>"
    NER_TOKEN_MASK = "<mask2>"
    MLM_CONNECTOR = "<conn1>"
    MLM_SGS_CONNECTOR = "<conn2>"
    
    text_column = "text"
    summary_column = "summary"

    #bertscore = load_metric('bertscore')

    rouge = evaluate.load("rouge")

        
    def get_other_text(sentence, text):
        res = text.replace(sentence, "", 1)
        if (len(res) == 0 and len(res.split()) == 0):
            res = text
        return res

    def extract_entties_with_spacy(source, spacy_pipeline):
        
        list_entities = []

        spacy_doc = spacy_pipeline(source)#, n_process=-1)
        list_entities = [a.text for a in spacy_doc.ents]

        return list_entities

    
    def mask_sentence(sentence, entities, prob_ner, prob_token):

        tokens_sentence = sentence.translate(str.maketrans('', '', string.punctuation))
        processed = sentence
        tokens_sentnece_list = tokens_sentence.split()

        tokens_sentnece_list_masking = np.zeros(len(tokens_sentnece_list))

        for entity in entities:
            if entity in sentence:
                tokens_sentence = tokens_sentence.replace(entity, "")
                if random.random() <= prob_ner:
                    entity_tokens = entity.split()
                    entity_length = len(entity_tokens)
                    for tokens_index in range(len(tokens_sentnece_list)):
                        tokens_seq = " ".join(tokens_sentnece_list[tokens_index:tokens_index + entity_length])
                        entity_seq = entity
                        if tokens_seq == entity_seq:
                            tokens_sentnece_list_masking[tokens_index:tokens_index + entity_length] = 2

        for token_index in range(len(tokens_sentnece_list)):

            token = tokens_sentnece_list[token_index]

            if tokens_sentnece_list_masking[token_index] == 2:
                # NER MASK
                continue

            if random.random() <= prob_token:
                tokens_sentnece_list_masking[token_index] = 1
            else :
                tokens_sentnece_list_masking[token_index] = 0

        result = []

        for i in range(len(tokens_sentnece_list_masking)):
            if tokens_sentnece_list_masking[i] == 2:
                # if prev is mask2 then do not repeat
                if i > 0 and tokens_sentnece_list_masking[i - 1] == 2:
                    continue

                result.append(NER_MASK)

            elif tokens_sentnece_list_masking[i] == 1:
                # if prev is mask1 then do not repeat
                if i > 0 and tokens_sentnece_list_masking[i - 1] == 1:
                    result.append(NER_TOKEN_MASK)

            else:
                result.append(tokens_sentnece_list[i])

        processed = " ".join(result)

        label = " ".join(np.where(tokens_sentnece_list_masking, tokens_sentnece_list, "<unmasked>"))

        return [processed, label, tokens_sentnece_list_masking]
    
    def sentence_scorer(sentence, text, entities, rouge_score):

        # count entities
        entites_count = 0
        for entity in entities:
            if entity in sentence:
                entites_count += 1
        entites_score = (entites_count * 1.0) / 4

        # bertscore
        #bertscore_results = bertscore.compute(predictions=[sentence], references=[text], lang='en')

#        bertscore_value = np.average(bertscore_results["f1"])

        return entites_score + rouge_score# + bertscore_value

    
    def to_sentences(text):
        sentences = nltk.sent_tokenize(text)
        texts = [get_other_text(sent, text) for sent in sentences]
        
        return (texts, sentences)
    
    def custom_train_preprocess(examples):
    
        #batch_texts = ". ".join(examples[text_column])
        
        
        list_entities = []

        #list_entities = extract_entties_with_spacy(batch_texts, spacy_pipeline)

        all_texts = []
        all_sentences = []
        groups = []
        for i in range(len(examples[text_column])):

            preprocessed_exp = examples[text_column][i]

        
            texts, sentences = to_sentences(preprocessed_exp)
            start_index = len(all_texts)
        #   for tex in texts:
        #        all_texts.append(tex)
        #    for sen in sentences:
        #        all_sentences.append(sen)
            all_texts.extend(texts)
            all_sentences.extend(sentences)
            end_index = len(all_texts)
            groups.append((start_index, end_index, sentences))

        # compute rouge score for the whole batch
    
        result_rouge = rouge.compute(predictions=all_sentences, references=all_texts, rouge_types=["rouge1"], use_aggregator=False)
        rouge_scores = [score for score in result_rouge.get("rouge1")]
        #rouge_scores = [random.random() for sent in all_sentences]
        
            
        documents = []
        summaries = []
        mlm_labels = []
        for i in range(len(examples[text_column])):

            preprocessed_exp = examples[text_column][i]

#            list_entities = extract_entties_with_spacy(preprocessed_exp, spacy_pipeline)
            list_entities = examples["ner"][i]
            
          #  texts, sentences = to_sentences(preprocessed_exp)

            sentences = groups[i][2]
            this_rouge_scores = rouge_scores[groups[i][0]:groups[i][1]]
            
         #   result_rouge = rouge.compute(predictions=sentences, references=texts, use_stemmer=True, use_aggregator=False)
          #  rouge_scores = [score.fmeasure for score in result_rouge.get("rouge1")]
            
            examples_data = {
                "sentence": sentences,
           #     "rouge1": rouge_scores
                "rouge1": this_rouge_scores
            }

            df = pd.DataFrame(examples_data)
            df["score"] = df.apply(lambda x: sentence_scorer(x["sentence"], preprocessed_exp, list_entities, x["rouge1"]), axis=1)
            df.sort_values("score", ascending=False, inplace=True)
            df["normalized_score"] = df["score"].map(lambda x: x / (df["score"].max() + 0.001))
            df["masked_labels"] = df.apply(
                lambda x: 
                    mask_sentence(x["sentence"], list_entities, data_args["ner_mlm_prob"], x["normalized_score"] * (0.1 + data_args["ner_mlm_prob"]))
                          , 
                axis=1)
            df["masked"] = df["masked_labels"].map(lambda x: x[0] )
            df["labels"] = df["masked_labels"].map(lambda x: x[1])
            
            #df["num_masked1"] = df["masked"].map(lambda x: x.count("<mask1>"))
            #df["num_masked2"] = df["masked"].map(lambda x: x.count("<mask2>"))
            #df["length"] = df["sentence"].map(lambda x: len(x.split()))
            #df["length_masked"] = df["masked"].map(lambda x: len(x.split()))
            #df["masked_prob"] = df.apply(lambda x: ((x["num_masked2"] + x["num_masked1"]) + (x["length"] - x["length_masked"])) / x["length"], axis=1)


            #print(df)
            
            if data_args["ner_mlm"]:
                # mlm for faithfull
                summary = " ".join(df["sentence"].sort_index( ))
            elif data_args["ner_sgs_mlm"]:
                # mlm-sgs for faithfull
                num_sentences = df.shape[0] * 0.25
                num_sentences = int(round(num_sentences, 0))
                num_sentences = 1 if num_sentences == 0 else num_sentences
                summary = " ".join(df.head(num_sentences).sort_index( )["sentence"])

            preprocessed_exp = " ".join(df["masked"].sort_index())
            if data_args["ner_mlm"]:
                preprocessed_exp = " ".join([MLM_CONNECTOR, preprocessed_exp])
            elif  data_args["ner_sgs_mlm"]:
                preprocessed_exp = " ".join([MLM_CONNECTOR, MLM_SGS_CONNECTOR, preprocessed_exp])


            #print(df.sort_index())
            #print(df.describe())

            mlm_labels.append(" ".join(df["labels"].sort_index( )))
            summaries.append(summary)
            documents.append(preprocessed_exp)

        new_examples = {}
        new_examples[text_column] = documents
        new_examples[summary_column] = summaries
        new_examples["mlm_label"] = mlm_labels
        return new_examples
    
    return wiki_datasets.select(range(selections)).map(
        custom_train_preprocess,
        batched=True,
        num_proc=num_proc,
        batch_size=batch_size
    )
    

In [40]:
sample = pre_process_wiki(num_proc=1, selections=200, batch_size=100)

In [None]:
sample["text"][3]

In [None]:
sample["mlm_label"][3]

In [None]:
sample["summary"][3]

In [14]:
sample = pre_process_wiki(num_proc=10, selections=20000, batch_size=2000)

Map (num_proc=10):   0%|          | 0/20000 [00:00<?, ? examples/s]

In [15]:
sample = pre_process_wiki(num_proc=14, selections=20000, batch_size=100)

Map (num_proc=14):   0%|          | 0/20000 [00:00<?, ? examples/s]

In [16]:
sample = pre_process_wiki(num_proc=14, selections=20000, batch_size=1000)

Map (num_proc=14):   0%|          | 0/20000 [00:00<?, ? examples/s]

In [9]:
sample = pre_process_wiki(num_proc=14, selections=20000, batch_size=500)

Map (num_proc=14):   0%|          | 0/20000 [00:00<?, ? examples/s]

In [18]:
sample = pre_process_wiki(num_proc=14, selections=40000, batch_size=5000)

Map (num_proc=14):   0%|          | 0/40000 [00:00<?, ? examples/s]

In [48]:
sample = pre_process_wiki(num_proc=14, selections=20000, batch_size=2000)

Map (num_proc=14):   0%|          | 0/20000 [00:00<?, ? examples/s]

In [15]:
sample = pre_process_wiki(num_proc=1, selections=10, batch_size=300)

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

In [4]:
wiki_dataset_mlm = pre_process_wiki(num_proc=14, selections=wiki_datasets.shape[0], batch_size=300)

Map (num_proc=14):   0%|          | 0/3767787 [00:00<?, ? examples/s]

In [5]:
wiki_dataset_mlm.save_to_disk("G:\.cache\huggingface\datasets\mlm\wiki_mlm")

Saving the dataset (0/62 shards):   0%|          | 0/3767787 [00:00<?, ? examples/s]

In [11]:
wiki_datasets

Dataset({
    features: ['id', 'url', 'title', 'text', 'ner'],
    num_rows: 3767787
})

In [12]:
wiki_dataset_mlm_sgs = pre_process_wiki(num_proc=14, selections=2000, batch_size=300)

Map (num_proc=14):   0%|          | 0/2000 [00:00<?, ? examples/s]

In [15]:
wiki_dataset_mlm_sgs[5]

{'id': '580',
 'url': 'https://en.wikipedia.org/wiki/Astronomer',
 'title': 'Astronomer',
 'text': '<conn1> <conn2> astronomer is a scientist in the field of astronomy who focuses studies on a specific question or field outside the scope <mask1> They observe astronomical objects such stars planets moons comets and galaxies – in either observational by analyzing the data or theoretical astronomy Examples of topics or fields astronomers study include planetary science solar the origin or evolution of stars or the formation of galaxies A related but distinct is physical cosmology which studies the Universe as a whole Types Astronomers fall under either <mask1> main types observational and theoretical Observational astronomers make direct observations of celestial objects and analyze the data In contrast theoretical astronomers create and investigate models of things that cannot be observed Because it takes <mask1> of <mask2> a of stars or a galaxy to complete a life cycle astronomers must

In [69]:
len(nltk.sent_tokenize(wiki_dataset_mlm_sgs[9]["text"]))

11

In [68]:
len(nltk.sent_tokenize(wiki_dataset_mlm_sgs[9]["summary"]))

3

In [None]:
wiki_dataset_mlm_sgs = pre_process_wiki(num_proc=14, selections=wiki_datasets.shape[0], batch_size=300)

Map (num_proc=14):   0%|          | 0/3767787 [00:00<?, ? examples/s]

In [11]:
wiki_dataset_mlm_sgs.save_to_disk("G:\.cache\huggingface\datasets\mlm\wiki_mlm_sgs")

Saving the dataset (0/52 shards):   0%|          | 0/3767787 [00:00<?, ? examples/s]

In [73]:
wiki_dataset_mlm

Dataset({
    features: ['id', 'url', 'title', 'text', 'ner', 'summary'],
    num_rows: 3767787
})

In [7]:
wiki_dataset_mlm_sgs[0]["text"]

'<conn1> <conn2> <mask1> recognizes for art direction in film The <mask2> name <mask1> but was to its current name in <mask1> for the 85th Awards This change resulted from the Art Directors of <mask1> AMPAS renamed Designers Since <mask1> the award is shared with the set decorators It is awarded to the best interior design in a film The films are listed with <mask1> for example the Academy Award <mask1> given to a film <mask1> the lists below the winner of the award for <mask1> is shown first followed by the other nominees alphabetical order Winners and <mask1> 1940s <mask1> 2000s <mask1> <mask2> <mask2> <mask2> <mask2> <mask2> <mask2> <mask2> Design Choice <mask2> <mask2> <mask2> <mask2> <mask2> Notes <mask2> <mask2> <mask2> <mask2> <mask2> <mask2> <mask2> <mask2>'

In [10]:
wiki_dataset_mlm_sgs[3767780]["mlm_label"]

'Rhodactis inchoata <unmasked> <unmasked> as <unmasked> bullseye <unmasked> or Tonga <unmasked> <unmasked> <unmasked> <unmasked> species <unmasked> <unmasked> <unmasked> <unmasked> <unmasked> Rhodactis <unmasked> <unmasked> is also <unmasked> <unmasked> <unmasked> <unmasked> <unmasked> as <unmasked> <unmasked> <unmasked> <unmasked> <unmasked> <unmasked> <unmasked> <unmasked> species <unmasked> Rhodactis <unmasked> <unmasked> <unmasked> <unmasked> be <unmasked> <unmasked> aquariums <unmasked> <unmasked> prefers <unmasked> <unmasked> medium <unmasked> <unmasked> <unmasked> <unmasked> <unmasked> <unmasked> <unmasked> kept <unmasked> <unmasked> <unmasked> low <unmasked> <unmasked> <unmasked> movement <unmasked> <unmasked> this species <unmasked> <unmasked> <unmasked> as <unmasked> <unmasked> <unmasked> <unmasked> <unmasked> <unmasked> <unmasked> <unmasked> <unmasked> <unmasked> <unmasked> <unmasked> <unmasked> <unmasked> <unmasked> <unmasked> <unmasked> <unmasked> <unmasked> <unmasked> <un

In [75]:
wiki_datasets1 = load_dataset(
    "wikipedia",
    "20220301.en",
     cache_dir="G:\.cache\huggingface\datasets"
)

In [76]:
wiki_datasets1

DatasetDict({
    train: Dataset({
        features: ['id', 'url', 'title', 'text'],
        num_rows: 6458670
    })
})

In [79]:
from datasets import DatasetDict

In [81]:
DatasetDict({"train": wiki_dataset_mlm})

DatasetDict({
    train: Dataset({
        features: ['id', 'url', 'title', 'text', 'ner', 'summary'],
        num_rows: 3767787
    })
})

In [141]:
import string
import numpy as np
import random

NER_MASK = "<mask1>"
NER_TOKEN_MASK = "<mask2>"
MLM_CONNECTOR = "<conn1>"
MLM_SGS_CONNECTOR = "<conn2>"
    
def mask_sentence(sentence, entities, prob_ner, prob_token):

    
    tokens_sentence = sentence.translate(str.maketrans('', '', string.punctuation))
    processed = sentence
    tokens_sentnece_list = tokens_sentence.split()

    tokens_sentnece_list_masking = np.zeros(len(tokens_sentnece_list))

    for entity in entities:
        if entity in sentence:
            tokens_sentence = tokens_sentence.replace(entity, "")
            if random.random() <= prob_ner:
                entity_tokens = entity.split()
                entity_length = len(entity_tokens)
                for tokens_index in range(len(tokens_sentnece_list)):
                    tokens_seq = " ".join(tokens_sentnece_list[tokens_index:tokens_index + entity_length])
                    entity_seq = entity
                  #  print("entity_seq", entity)
                  #  print("tokens_seq", tokens_seq)
                    if tokens_seq == entity_seq:
                        tokens_sentnece_list_masking[tokens_index:tokens_index + entity_length] = 2
                        
                #processed = sentence.replace(entity, NER_MASK)

    for token_index in range(len(tokens_sentnece_list)):
        
        token = tokens_sentnece_list[token_index]
        
        if tokens_sentnece_list_masking[token_index] == 2:
            # NER MASK
            continue
            
        if random.random() <= prob_token:
            tokens_sentnece_list_masking[token_index] = 1
            #processed = processed.replace(" " + token, " " + NER_TOKEN_MASK, 1)
            #processed = processed.replace(" " + token + " ", " " + NER_TOKEN_MASK + " ", 1)
            #processed = processed.replace(token + " ", NER_TOKEN_MASK + " ", 1)
        else :
            tokens_sentnece_list_masking[token_index] = 0

    result = []
    
    for i in range(len(tokens_sentnece_list_masking)):
        if tokens_sentnece_list_masking[i] == 2:
            # if prev is mask2 then do not repeat
            if i > 0 and tokens_sentnece_list_masking[i - 1] == 2:
                continue
                
            result.append(NER_MASK)
            
        elif tokens_sentnece_list_masking[i] == 1:
            # if prev is mask1 then do not repeat
            if True or i > 0 and tokens_sentnece_list_masking[i - 1] == 1:
                result.append(NER_TOKEN_MASK)
            
        else:
            result.append(tokens_sentnece_list[i])
        
    processed = " ".join(result)
    
    label = " ".join(np.where(tokens_sentnece_list_masking, tokens_sentnece_list, "<unmasked>"))
    
    return processed, " ".join(np.where(tokens_sentnece_list_masking, tokens_sentnece_list, "<unmasked>")), tokens_sentnece_list_masking

In [84]:
sentence = "some text sentence"
masked, label, mask = mask_sentence(sentence, [], 0.5, 0.5)
print(masks)
print(masked)

some text <unmasked>
<mask2> sentence


In [32]:
np.where(masks, tokens, "unmasked")

array(['unmasked', 'unmasked', 'sentence'], dtype='<U8')

In [35]:
from datasets import load_from_disk
wiki_dataset = load_from_disk("G:\.cache\huggingface\datasets\wiki_100_1000")

In [142]:
masked, label, mask = mask_sentence(wiki_dataset["train"][0]["text"], [
    "The Academy Award for Best Production Design",
    "Motion Picture Arts and Sciences (AMPAS)",
    "Designer's ",
    "1999"
], 1, 0.3)

In [143]:
masked

'<mask1> recognizes achievement <mask2> art <mask2> in film <mask2> categorys original name <mask2> Best Art Direction but was changed to its current name in 2012 for the <mask2> Academy <mask2> This <mask2> <mask2> from the <mask2> <mask2> branch of the <mask2> of Motion <mask2> Arts and <mask2> AMPAS <mask2> <mask2> the Designers branch Since 1947 the award <mask2> <mask2> <mask2> the <mask2> decorators It is awarded to the best <mask2> design <mask2> <mask2> film The <mask2> below are listed with their <mask2> <mask2> for example the 2000 Academy Award for Best Art Direction is given <mask2> <mask2> <mask2> from <mask1> <mask2> the lists below the <mask2> <mask2> the <mask2> for <mask2> year is shown first followed by the other nominees in alphabetical order Superlatives <mask2> and <mask2> <mask2> 1930s <mask2> 1950s 1960s 1970s <mask2> 1990s 2000s 2010s <mask2> See also <mask2> <mask2> for Best Production Design <mask2> <mask2> <mask2> Award <mask2> Best Production Design Notes Re

In [144]:
label

'The Academy Award for Best Production Design <unmasked> <unmasked> for <unmasked> direction <unmasked> <unmasked> The <unmasked> <unmasked> <unmasked> was <unmasked> <unmasked> <unmasked> <unmasked> <unmasked> <unmasked> <unmasked> <unmasked> <unmasked> <unmasked> <unmasked> <unmasked> <unmasked> <unmasked> 85th <unmasked> Awards <unmasked> change resulted <unmasked> <unmasked> Art Directors <unmasked> <unmasked> <unmasked> Academy <unmasked> <unmasked> Picture <unmasked> <unmasked> Sciences <unmasked> being renamed <unmasked> <unmasked> <unmasked> <unmasked> <unmasked> <unmasked> <unmasked> is shared with <unmasked> set <unmasked> <unmasked> <unmasked> <unmasked> <unmasked> <unmasked> <unmasked> interior <unmasked> in a <unmasked> <unmasked> films <unmasked> <unmasked> <unmasked> <unmasked> <unmasked> production year <unmasked> <unmasked> <unmasked> <unmasked> <unmasked> <unmasked> <unmasked> <unmasked> <unmasked> <unmasked> <unmasked> <unmasked> to a film <unmasked> 1999 In <unmaske

In [145]:
wiki_dataset["train"][0]["text"]

"The Academy Award for Best Production Design recognizes achievement for art direction in film. The category's original name was Best Art Direction, but was changed to its current name in 2012 for the 85th Academy Awards. This change resulted from the Art Director's branch of the Academy of Motion Picture Arts and Sciences (AMPAS) being renamed the Designer's branch. Since 1947, the award is shared with the set decorator(s). It is awarded to the best interior design in a film.\n\nThe films below are listed with their production year (for example, the 2000 Academy Award for Best Art Direction is given to a film from 1999). In the lists below, the winner of the award for each year is shown first, followed by the other nominees in alphabetical order.\n\nSuperlatives\n\nWinners and nominees\n\n1920s\n\n1930s\n\n1940s\n\n1950s\n\n1960s\n\n1970s\n\n1980s\n\n1990s\n\n2000s\n\n2010s\n\n2020s\n\nSee also\n BAFTA Award for Best Production Design\n Critics' Choice Movie Award for Best Production 

In [102]:
mask

array([2., 2., 2., 2., 2., 2., 2., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 1., 1., 0., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
       0., 1., 0., 0., 0., 1., 1., 1., 0., 0., 0., 0., 0., 1., 0., 1., 1.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 1.,
       0., 0., 1., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 1., 1., 0., 0., 1., 0., 0., 0., 0., 0., 1., 1., 1., 1., 1., 0.,
       0., 0., 1., 2., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0.,
       0., 0., 0., 1., 1., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 1.,
       1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 1., 0., 1.])

In [None]:
  def mask_sentence(sentence, entities, prob_ner, prob_token):

        tokens_sentence = sentence.translate(str.maketrans('', '', string.punctuation))
        processed = sentence
        tokens_sentnece_list = tokens_sentence.split()
        
        tokens_sentnece_list_masking = np.zeros()


        for entity in entities:
            if entity in sentence:
                tokens_sentence = tokens_sentence.replace(entity, "")
                if random.random() <= prob_ner:
                    processed = sentence.replace(entity, NER_MASK)

        for token in tokens_sentnece_list:
            if token == NER_MASK:
                continue
            if random.random() <= prob_token:
                processed = processed.replace(" " + token, " " + NER_TOKEN_MASK, 1)
                processed = processed.replace(" " + token + " ", " " + NER_TOKEN_MASK + " ", 1)
                processed = processed.replace(token + " ", NER_TOKEN_MASK + " ", 1)

        for i in range(0, len(tokens_sentnece_list)):
            multiple_masks = NER_TOKEN_MASK
            for j in range (0, i):
                multiple_masks += " " + NER_TOKEN_MASK
            processed = processed.replace(multiple_masks, NER_TOKEN_MASK)

        return processed
  