In [1]:
import logging
import os
import sys
from dataclasses import dataclass, field
from typing import Optional

import datasets
import nltk  # Here to have a nice missing dependency error message early on
import numpy as np
from datasets import load_dataset, load_metric

import transformers
from filelock import FileLock
from transformers import (
    AutoConfig,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    HfArgumentParser,
    MBart50Tokenizer,
    MBart50TokenizerFast,
    MBartTokenizer,
    MBartTokenizerFast,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    set_seed,
)
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version, is_offline_mode, send_example_telemetry
from transformers.utils.versions import require_version


from data import DataCollatorForSeq2SeqWithMultipleReferences
from BSF_Trainer import BSFTrainer
from trainer import CustomTrainer

import traceback


# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
# check_min_version("4.21.0.dev0")

# require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt")

logger = logging.getLogger(__name__)

# Make sure the NLTK "punkt" sentence-tokenizer data is available before any
# cell calls nltk.sent_tokenize.
try:
    nltk.data.find("tokenizers/punkt")
except (LookupError, OSError):
    # The data is missing locally; in offline mode it cannot be fetched, so fail
    # with an actionable message instead of attempting a download.
    if is_offline_mode():
        raise LookupError(
            "Offline mode: run this script without TRANSFORMERS_OFFLINE first to download nltk data files"
        )
    # The file lock serialises the download across processes sharing this
    # working directory (the lock file is created relative to the CWD).
    with FileLock(".lock") as lock:
        nltk.download("punkt", quiet=True)

# A list of all multilingual tokenizers which require the lang attribute.
MULTILINGUAL_TOKENIZERS = [MBartTokenizer, MBartTokenizerFast, MBart50Tokenizer, MBart50TokenizerFast]

import string
import random


import pandas as pd
import spacy
import random    
import nltk

In [2]:
# Pre-training objective switches read by custom_train_preprocess:
#   ner_mlm      - the target summary is the whole (unmasked) document.
#   ner_sgs_mlm  - the target summary is the top-scoring third of the sentences.
#   ner_mlm_prob - probability of masking an entity; also scaled by each
#                  sentence's normalised score to get the per-token mask probability.
data_args = {
    "ner_mlm": False,
    "ner_sgs_mlm": True,
    "ner_mlm_prob": 0.6
}

# Placeholder written in place of a masked named entity.
NER_MASK = "<mask1>"
# Placeholder written in place of a masked ordinary token.
NER_TOKEN_MASK = "<mask2>"
# Prefix tokens prepended to the masked document to mark which objective produced it.
MLM_CONNECTOR = "<conn1>"
MLM_SGS_CONNECTOR = "<conn2>"

# Metric objects shared by sentence_scorer and the scoring helpers in later cells.
rouge = load_metric("rouge")
bertscore = load_metric('bertscore')

# Load the small English spaCy model, downloading it on first use if it is not installed.
try:
    spacy_pipeline = spacy.load('en_core_web_sm')
except OSError:
    logging.warning("Downloading language model for the spaCy model.")
    from spacy.cli import download
    download('en_core_web_sm')
    spacy_pipeline = spacy.load('en_core_web_sm')

  rouge = load_metric("rouge")


In [3]:
import spacy
# Per-process cache: loading a spaCy model is expensive, and the extractor below
# is called once per document, so the model must not be reloaded on every call.
_SPACY_PIPELINE_CACHE = {}


def _get_spacy_pipeline(model_name='en_core_web_sm'):
    """Return the spaCy pipeline for ``model_name``, loading it only on first use.

    If the model is not installed (``spacy.load`` raises ``OSError``) it is
    downloaded and then loaded.
    """
    if model_name not in _SPACY_PIPELINE_CACHE:
        try:
            pipeline = spacy.load(model_name)
        except OSError:
            logging.warning("Downloading language model for the spaCy model.")
            from spacy.cli import download
            download(model_name)
            pipeline = spacy.load(model_name)
        _SPACY_PIPELINE_CACHE[model_name] = pipeline
    return _SPACY_PIPELINE_CACHE[model_name]


def extract_entties_with_spacy(source):
    """Return the surface text of every named entity spaCy finds in ``source``.

    Args:
        source: The document text to analyse.

    Returns:
        A list of entity strings in document order (duplicates are kept).
    """
    spacy_doc = _get_spacy_pipeline()(source)
    return [a.text for a in spacy_doc.ents]


In [4]:
def sentence_scorer(sentence, text, entities):
    """Score how representative ``sentence`` is of the rest of its document.

    The score is the sum of three terms:
      * the number of ``entities`` occurring in the sentence, divided by 4;
      * ROUGE-1 F-measure of the sentence against the reference text;
      * BERTScore F1 of the sentence against the reference text.

    The reference is the document with the first occurrence of the sentence
    removed. The original code computed that remainder (``text_else``) but then
    scored against the full ``text``, so every sentence was compared with a
    reference that contained itself. When removing the sentence leaves nothing
    but whitespace (single-sentence document), the full text is used instead.

    Args:
        sentence: The candidate sentence.
        text: The full document the sentence comes from.
        entities: Entity strings extracted from the document.

    Returns:
        The combined score as a float.
    """
    text_else = text.replace(sentence, "", 1)
    if not text_else.strip():
        # Nothing left to compare against; fall back to the whole document.
        text_else = text

    # Entity coverage: one point per entity string contained in the sentence.
    entites_count = sum(1 for entity in entities if entity in sentence)
    entites_score = (entites_count * 1.0) / 4

    # ROUGE-1 F-measure (aggregated result exposes .mid.fmeasure).
    result_rouge = rouge.compute(predictions=[sentence], references=[text_else], use_stemmer=True)
    rouge_score = result_rouge.get("rouge1").mid.fmeasure

    # BERTScore F1, averaged over the (single) prediction.
    bertscore_results = bertscore.compute(predictions=[sentence], references=[text_else], lang='en')
    bertscore_value = np.average(bertscore_results["f1"])

    return entites_score + rouge_score + bertscore_value

def mask_sentence(sentence, entities, prob_ner, prob_token):
    """Randomly mask named entities and ordinary tokens in ``sentence``.

    Each entity found in the sentence is replaced by ``NER_MASK`` with
    probability ``prob_ner``. Each remaining token (punctuation stripped,
    entity text removed) is replaced by ``NER_TOKEN_MASK`` with probability
    ``prob_token``. Runs of space-separated ``NER_TOKEN_MASK`` are then
    collapsed to a single mask.

    Args:
        sentence: The sentence to mask.
        entities: Entity strings for the whole document.
        prob_ner: Probability of masking each entity present in the sentence.
        prob_token: Probability of masking each non-entity token.

    Returns:
        The masked sentence.
    """
    tokens_sentence = sentence.translate(str.maketrans('', '', string.punctuation))
    processed = sentence

    for entity in entities:
        if entity in sentence:
            # Entity words are handled here, so drop them from the token candidates.
            tokens_sentence = tokens_sentence.replace(entity, "")
            if random.random() <= prob_ner:
                # Accumulate on `processed`: replacing on `sentence` discarded every
                # previously applied entity mask, leaving only the last one.
                processed = processed.replace(entity, NER_MASK)

    for token in tokens_sentence.split():
        if token == NER_MASK:
            continue
        if random.random() <= prob_token:
            # NOTE(review): these are substring replacements, so a short token can
            # also match inside a longer word; kept as in the original.
            processed = processed.replace(" " + token, " " + NER_TOKEN_MASK, 1)
            processed = processed.replace(" " + token + " ", " " + NER_TOKEN_MASK + " ", 1)
            processed = processed.replace(token + " ", NER_TOKEN_MASK + " ", 1)

    # Collapse every run of adjacent token masks into one. The previous loop tried
    # patterns from shortest to longest, which left long runs only partly collapsed.
    adjacent_masks = NER_TOKEN_MASK + " " + NER_TOKEN_MASK
    while adjacent_masks in processed:
        processed = processed.replace(adjacent_masks, NER_TOKEN_MASK)

    return processed
    
def custom_train_preprocess(examples):
    """Turn a batch of documents into (masked document, pseudo-summary) pairs.

    Intended for ``Dataset.map(..., batched=True)``. For every document in
    ``examples[text_column]`` the function extracts entities, splits the text
    into sentences, scores each sentence with ``sentence_scorer`` and masks it
    with ``mask_sentence`` (higher-scoring sentences get a higher token-mask
    probability).

    * ``data_args["ner_mlm"]``: the summary is the full original document and the
      masked document is prefixed with ``MLM_CONNECTOR``.
    * otherwise ``data_args["ner_sgs_mlm"]``: the summary is the top-scoring third
      of the sentences (in document order) and the masked document is prefixed
      with ``MLM_CONNECTOR MLM_SGS_CONNECTOR``.

    Relies on the module-level names ``data_args``, ``text_column``,
    ``summary_column`` and the connector constants.

    Args:
        examples: A batch mapping column name -> list of values.

    Returns:
        A dict with the masked documents under ``text_column`` and the summaries
        under ``summary_column``; or ``examples`` unchanged when neither
        objective is enabled.
    """
    # Without this guard `summary` is never assigned and the loop below raises
    # NameError when both switches are off.
    if (not data_args["ner_mlm"]) and (not data_args["ner_sgs_mlm"]):
        return examples

    mask_prob = data_args["ner_mlm_prob"]
    documents = []
    summaries = []
    for preprocessed_exp in examples[text_column]:
        list_entities = extract_entties_with_spacy(preprocessed_exp)
        sentences = nltk.sent_tokenize(preprocessed_exp)

        df = pd.DataFrame({"sentence": sentences})
        df["score"] = df["sentence"].map(lambda x: sentence_scorer(x, preprocessed_exp, list_entities))
        df = df.sort_values("score", ascending=False)

        # Normalise by the best score once (not once per row). An empty or
        # all-zero score column would otherwise produce NaN.
        max_score = df["score"].max()
        df["normalized_score"] = df["score"] / max_score if max_score > 0 else 0.0

        # Plain iteration (rather than df.apply(axis=1)) also works for a document
        # with no sentences, and visits rows in the same sorted order.
        df["masked"] = [
            mask_sentence(sent, list_entities, mask_prob, norm * mask_prob)
            for sent, norm in zip(df["sentence"], df["normalized_score"])
        ]

        if data_args["ner_mlm"]:
            # mlm for faithfull
            summary = " ".join(df["sentence"].sort_index())
            connectors = [MLM_CONNECTOR]
        else:
            # mlm-sgs for faithfull: keep the best third, restored to document order
            summary = " ".join(df.head(int(df.shape[0] / 3.0)).sort_index()["sentence"])
            connectors = [MLM_CONNECTOR, MLM_SGS_CONNECTOR]

        masked_document = " ".join(df["masked"].sort_index())
        summaries.append(summary)
        documents.append(" ".join(connectors + [masked_document]))

    new_examples = {}
    new_examples[text_column] = documents
    new_examples[summary_column] = summaries
    return new_examples

In [2]:
# Column names read by custom_train_preprocess.
text_column = "text"
summary_column = "summary"

from datasets import load_from_disk

# Raw string: the Windows path contains backslash sequences (\., \h, \d, \w) that are
# invalid escapes in a normal string literal and only work by accident (newer Python
# versions warn about them). The resulting path is unchanged.
wiki_datasets = load_from_disk(r"C:\.cache\huggingface\datasets\wiki_100_1000")

In [2]:
sample = wiki_datasets["train"].select(range(10)).map(
    custom_train_preprocess,
    batched=True,
    num_proc=1,#data_args.preprocessing_num_workers,
)

NameError: name 'wiki_datasets' is not defined

In [9]:
sentence = "Hello How are you"
text = "I am fine you?"

In [14]:
result_rouge = rouge.compute(predictions=[sentence], references=[text], use_stemmer=True, use_aggregator=False)
rouge_score = result_rouge.get("rouge1")

In [57]:
rouge_score[0].fmeasure

0.25

In [18]:
wiki_datasets["train"]["text"][0]

"The Academy Award for Best Production Design recognizes achievement for art direction in film. The category's original name was Best Art Direction, but was changed to its current name in 2012 for the 85th Academy Awards. This change resulted from the Art Director's branch of the Academy of Motion Picture Arts and Sciences (AMPAS) being renamed the Designer's branch. Since 1947, the award is shared with the set decorator(s). It is awarded to the best interior design in a film.\n\nThe films below are listed with their production year (for example, the 2000 Academy Award for Best Art Direction is given to a film from 1999). In the lists below, the winner of the award for each year is shown first, followed by the other nominees in alphabetical order.\n\nSuperlatives\n\nWinners and nominees\n\n1920s\n\n1930s\n\n1940s\n\n1950s\n\n1960s\n\n1970s\n\n1980s\n\n1990s\n\n2000s\n\n2010s\n\n2020s\n\nSee also\n BAFTA Award for Best Production Design\n Critics' Choice Movie Award for Best Production 

In [None]:
sample = wiki_datasets["train"].select(range(10)).map(
    custom_train_preprocess,
    batched=True,
    num_proc=2,#data_args.preprocessing_num_workers,
)

In [43]:
import nltk
def get_other_text(sentence, text):
    """Return ``text`` with the first occurrence of ``sentence`` removed.

    If nothing but whitespace would remain (e.g. a single-sentence document
    with a trailing space or newline), the full ``text`` is returned so callers
    always get a non-blank reference. The previous check used ``and`` and
    therefore only caught an exactly empty remainder.
    """
    res = text.replace(sentence, "", 1)
    if not res.strip():
        res = text
    return res
def to_sentences(text):
    """Split ``text`` into sentences and pair each with the rest of the document.

    Returns:
        A dict with two parallel lists: ``"sentence"`` holds each sentence and
        ``"text"`` holds what ``get_other_text`` returns for that sentence.
    """
    paired = {"text": [], "sentence": []}
    for current in nltk.sent_tokenize(text):
        paired["text"].append(get_other_text(current, text))
        paired["sentence"].append(current)
    return paired
def preprocess(examples):
    """Flatten a batch of documents into one row per sentence.

    Args:
        examples: A batch whose ``"text"`` entry is a list of documents.

    Returns:
        A dict of two parallel lists, ``"text"`` and ``"sentence"``, formed by
        concatenating the ``to_sentences`` output of every document in order.
    """
    per_document = [to_sentences(document) for document in examples["text"]]
    return {
        "text": [rest for doc in per_document for rest in doc["text"]],
        "sentence": [sent for doc in per_document for sent in doc["sentence"]],
    }

In [53]:
sample = wiki_datasets["train"].select(range(1000)).map(
    preprocess,
    batched=True,
    remove_columns=wiki_datasets["train"].column_names,
    num_proc=1,#data_args.preprocessing_num_workers,
)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [77]:
def pre_process_score(examples):
    """Add a ``"bert_score"`` column: BERTScore F1 of each sentence vs. its text.

    The batch is updated in place and returned, using the module-level
    ``bertscore`` metric.
    """
    scored = bertscore.compute(
        predictions=examples["sentence"],
        references=examples["text"],
        lang='en',
    )
    examples["bert_score"] = scored["f1"]
    return examples

def pre_process__rouge_score(examples):
    """Add a ``"rouge"`` column: per-pair ROUGE-1 F-measure of sentence vs. text.

    The batch is updated in place and returned, using the module-level
    ``rouge`` metric with aggregation disabled so one score per pair comes back.
    """
    per_pair = rouge.compute(
        predictions=examples["sentence"],
        references=examples["text"],
        use_stemmer=True,
        use_aggregator=False,
    ).get("rouge1")

    fmeasures = []
    for pair_score in per_pair:
        fmeasures.append(pair_score.fmeasure)
    examples["rouge"] = fmeasures
    return examples

In [83]:
sample_bert = sample.select(range(20221)).map(
    pre_process_score,
    batched=True,
    num_proc=1,#data_args.preprocessing_num_workers,
    batch_size=10000
)

Map:   0%|          | 0/20221 [00:00<?, ? examples/s]

In [80]:
sample_rouge = sample.select(range(1000)).map(
    pre_process__rouge_score,
    batched=True,
    num_proc=1,#data_args.preprocessing_num_workers,
    batch_size=10000
)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [82]:
sample

Dataset({
    features: ['text', 'sentence'],
    num_rows: 20221
})

In [None]:
def bert_map():
    """Compute BERTScore F1 for the notebook-level ``sample`` dataset in parallel.

    Maps a nested scoring function over the first 20221 rows of ``sample`` with
    10 worker processes and batches of 500. The mapped dataset is assigned to a
    local variable and not returned, so the call is only useful for timing or
    for the side effects of ``Dataset.map``.
    """
    # Imports are done locally; most of them are not referenced in this function.
    import string
    import random


    import pandas as pd
    import spacy
    import random    
    import nltk
    import math
    from datasets import load_dataset, load_metric
    import multiprocessing 
    import os
    from multiprocessing import Process, Manager, Value
    
    
    # NOTE(review): this metric is loaded but never used below.
    rouge = load_metric("rouge")
    def pre_process_score(examples):
        """Add a ``"bert_score"`` column (BERTScore F1) to one batch and return it."""
        #lock = multiprocessing.Lock()
        # Restrict CUDA to device "2"; this is (re)set on every batch.
        os.environ["CUDA_VISIBLE_DEVICES"]="2" #"{0}".format( 0 if random.random()  <= 0.5 else 1)
        
        #os.environ["CUDA_VISIBLE_DEVICES"]= "{0}".format( 0 if random.random()  <= 0.5 else 1)
        # NOTE(review): the metric is loaded on every batch, not once per worker.
        bertscore = load_metric('bertscore')
        #result_rouge = rouge.compute(predictions=examples["sentence"], references=examples["text"], use_stemmer=True, use_aggregator=False)
        #rouge_score = [score.fmeasure for score in result_rouge.get("rouge1")]
        bertscore_results = bertscore.compute(predictions=examples["sentence"], references=examples["text"], lang='en')
        bert_scores = bertscore_results["f1"]

        #examples["rouge"] = rouge_score
        examples["bert_score"] = bert_scores
        return examples
    
    # 20221 is hard-coded; it matches the num_rows shown for `sample` in this notebook's output.
    sample_bert = sample.select(range(20221)).map(
        lambda x: pre_process_score(x),
        batched=True,
        num_proc=10,#data_args.preprocessing_num_workers,
        batch_size=500
    )

bert_map()

Map (num_proc=10):   0%|          | 0/20221 [00:00<?, ? examples/s]

In [145]:
sample

Dataset({
    features: ['text', 'sentence'],
    num_rows: 20221
})

In [12]:
def pre_process_wiki(num_proc = 1, selections = 100, batch_size = 1000):
    """Build (masked document, pseudo-summary) pairs from the wiki training split.

    Self-contained variant of the notebook's preprocessing pipeline: all
    configuration and helpers live inside the function. ROUGE-1 is computed
    once per batch, over every sentence of every document, instead of once per
    sentence.

    Fixes relative to the previous version:
      * entity masks now accumulate (each replacement used to restart from the
        unmasked sentence, so only the last sampled entity stayed masked);
      * runs of adjacent token masks are collapsed completely;
      * a whitespace-only "rest of document" falls back to the full text;
      * the spaCy fallback load also restricts the pipeline to ``ner``;
      * documents with no sentences no longer break the DataFrame assignment.

    Args:
        num_proc: Number of worker processes passed to ``Dataset.map``.
        selections: Number of leading rows of ``wiki_datasets["train"]`` to process.
        batch_size: Batch size passed to ``Dataset.map``.

    Returns:
        The mapped dataset, with masked documents in ``"text"`` and the generated
        summaries in ``"summary"``.
    """
    import logging
    import random
    import string

    import evaluate
    import nltk
    import pandas as pd
    import spacy

    data_args = {
        "ner_mlm": False,
        "ner_sgs_mlm": True,
        "ner_mlm_prob": 0.3
    }

    NER_MASK = "<mask1>"
    NER_TOKEN_MASK = "<mask2>"
    MLM_CONNECTOR = "<conn1>"
    MLM_SGS_CONNECTOR = "<conn2>"

    text_column = "text"
    summary_column = "summary"

    rouge = evaluate.load("rouge")

    # Only the NER component is needed; keep the fallback load consistent with that.
    try:
        spacy_pipeline = spacy.load('en_core_web_sm', enable=["ner"])
    except OSError:
        logging.warning("Downloading language model for the spaCy model.")
        from spacy.cli import download
        download('en_core_web_sm')
        spacy_pipeline = spacy.load('en_core_web_sm', enable=["ner"])

    def get_other_text(sentence, text):
        """Return ``text`` without the first occurrence of ``sentence`` (or ``text`` if that is blank)."""
        res = text.replace(sentence, "", 1)
        if not res.strip():
            res = text
        return res

    def extract_entties_with_spacy(source, spacy_pipeline):
        """Return the text of every entity the given pipeline finds in ``source``."""
        return [a.text for a in spacy_pipeline(source).ents]

    def sentence_scorer(sentence, text, entities, rouge_score):
        """Return (number of entities contained in the sentence) / 4 + ``rouge_score``."""
        entites_count = sum(1 for entity in entities if entity in sentence)
        return (entites_count * 1.0) / 4 + rouge_score

    def mask_sentence(sentence, entities, prob_ner, prob_token):
        """Mask entities with probability ``prob_ner`` and other tokens with ``prob_token``."""
        tokens_sentence = sentence.translate(str.maketrans('', '', string.punctuation))
        processed = sentence

        for entity in entities:
            if entity in sentence:
                tokens_sentence = tokens_sentence.replace(entity, "")
                if random.random() <= prob_ner:
                    # Replace on `processed`, not `sentence`, so earlier masks survive.
                    processed = processed.replace(entity, NER_MASK)

        for token in tokens_sentence.split():
            if token == NER_MASK:
                continue
            if random.random() <= prob_token:
                # NOTE(review): substring replacement; a short token can match inside a longer word.
                processed = processed.replace(" " + token, " " + NER_TOKEN_MASK, 1)
                processed = processed.replace(" " + token + " ", " " + NER_TOKEN_MASK + " ", 1)
                processed = processed.replace(token + " ", NER_TOKEN_MASK + " ", 1)

        # Collapse every run of adjacent token masks into a single mask.
        adjacent_masks = NER_TOKEN_MASK + " " + NER_TOKEN_MASK
        while adjacent_masks in processed:
            processed = processed.replace(adjacent_masks, NER_TOKEN_MASK)

        return processed

    def to_sentences(text):
        """Return ``(rest_of_document_per_sentence, sentences)`` for ``text``."""
        sentences = nltk.sent_tokenize(text)
        return ([get_other_text(sent, text) for sent in sentences], sentences)

    def custom_train_preprocess(examples):
        """Batched map function producing the masked documents and summaries."""
        if (not data_args["ner_mlm"]) and (not data_args["ner_sgs_mlm"]):
            # No objective selected: nothing to build.
            return examples

        mask_prob = data_args["ner_mlm_prob"]

        # Pass 1: split every document and remember which slice of the flat
        # sentence list belongs to it.
        all_texts = []
        all_sentences = []
        groups = []
        for document in examples[text_column]:
            texts, sentences = to_sentences(document)
            start_index = len(all_texts)
            all_texts.extend(texts)
            all_sentences.extend(sentences)
            groups.append((start_index, len(all_texts), sentences))

        # One ROUGE call for the whole batch; with use_aggregator=False the metric
        # returns one rouge1 score per (sentence, reference) pair.
        if all_sentences:
            result_rouge = rouge.compute(predictions=all_sentences, references=all_texts, rouge_types=["rouge1"], use_aggregator=False)
            rouge_scores = list(result_rouge.get("rouge1"))
        else:
            rouge_scores = []

        # Pass 2: score, mask and assemble each document.
        documents = []
        summaries = []
        for document, (start_index, end_index, sentences) in zip(examples[text_column], groups):
            list_entities = extract_entties_with_spacy(document, spacy_pipeline)

            df = pd.DataFrame({
                "sentence": sentences,
                "rouge1": rouge_scores[start_index:end_index],
            })
            df["score"] = [
                sentence_scorer(sent, document, list_entities, score)
                for sent, score in zip(df["sentence"], df["rouge1"])
            ]
            df = df.sort_values("score", ascending=False)
            # The small epsilon avoids division by zero when every score is 0.
            df["normalized_score"] = df["score"] / (df["score"].max() + 0.001)
            df["masked"] = [
                mask_sentence(sent, list_entities, mask_prob, norm * mask_prob)
                for sent, norm in zip(df["sentence"], df["normalized_score"])
            ]

            if data_args["ner_mlm"]:
                # mlm for faithfull
                summary = " ".join(df["sentence"].sort_index())
                connectors = [MLM_CONNECTOR]
            else:
                # mlm-sgs for faithfull: best third of the sentences, in document order
                summary = " ".join(df.head(int(df.shape[0] / 3.0)).sort_index()["sentence"])
                connectors = [MLM_CONNECTOR, MLM_SGS_CONNECTOR]

            masked_document = " ".join(df["masked"].sort_index())
            summaries.append(summary)
            documents.append(" ".join(connectors + [masked_document]))

        new_examples = {}
        new_examples[text_column] = documents
        new_examples[summary_column] = summaries
        return new_examples

    return wiki_datasets["train"].select(range(selections)).map(
        custom_train_preprocess,
        batched=True,
        num_proc=num_proc,
        batch_size=batch_size
    )
    

In [31]:
sample = pre_process_wiki(num_proc=10, selections=20000)

Map (num_proc=10):   0%|          | 0/20000 [00:00<?, ? examples/s]

In [9]:
pre_process_wiki(num_proc=16, selections=20000)

Map (num_proc=16):   0%|          | 0/20000 [00:00<?, ? examples/s]

Dataset({
    features: ['id', 'url', 'title', 'text', 'summary'],
    num_rows: 20000
})

In [11]:
sampple2000 = pre_process_wiki(num_proc=24, selections=2000)

Map (num_proc=24):   0%|          | 0/2000 [00:00<?, ? examples/s]

In [22]:
sampple2000 = pre_process_wiki(num_proc=24, selections=2000, batch_size=50)

Map (num_proc=24):   0%|          | 0/2000 [00:00<?, ? examples/s]

In [24]:
sampple2000 = pre_process_wiki(num_proc=24, selections=2000, batch_size=50)

Map (num_proc=24):   0%|          | 0/2000 [00:00<?, ? examples/s]

In [10]:
rouge = evaluate.load("rouge")

In [33]:
sampple2000 = pre_process_wiki(num_proc=20, selections=2000, batch_size=100)

Map (num_proc=20):   0%|          | 0/2000 [00:00<?, ? examples/s]

In [26]:
sampple2000 = pre_process_wiki(num_proc=2, selections=2000, batch_size=1000)

Map (num_proc=2):   0%|          | 0/2000 [00:00<?, ? examples/s]

In [37]:
sampple2000 = pre_process_wiki(num_proc=8, selections=4000, batch_size=500)

Map (num_proc=8):   0%|          | 0/4000 [00:00<?, ? examples/s]

In [38]:
sampple2000 = pre_process_wiki(num_proc=16, selections=4000, batch_size=250)

Map (num_proc=16):   0%|          | 0/4000 [00:00<?, ? examples/s]

In [44]:
sampple2000 = pre_process_wiki(num_proc=4, selections=4000, batch_size=1000)

Map (num_proc=4):   0%|          | 0/4000 [00:00<?, ? examples/s]

In [47]:
sampple2000 = pre_process_wiki(num_proc=8, selections=4000, batch_size=500)

Map (num_proc=8):   0%|          | 0/4000 [00:00<?, ? examples/s]

In [53]:
sampple2000 = pre_process_wiki(num_proc=16, selections=4000, batch_size=250)

Map (num_proc=16):   0%|          | 0/4000 [00:00<?, ? examples/s]

In [59]:
sampple2000 = pre_process_wiki(num_proc=16, selections=4000, batch_size=250)

Map (num_proc=16):   0%|          | 0/4000 [00:00<?, ? examples/s]

In [61]:
sampple2000 = pre_process_wiki(num_proc=16, selections=4000, batch_size=250)

Map (num_proc=16):   0%|          | 0/4000 [00:00<?, ? examples/s]

In [63]:
sampple2000 = pre_process_wiki(num_proc=16, selections=4000, batch_size=250)

Map (num_proc=16):   0%|          | 0/4000 [00:00<?, ? examples/s]

In [13]:
sampple2000 = pre_process_wiki(num_proc=16, selections=4000, batch_size=250)

In [15]:
sampple2000 = pre_process_wiki(num_proc=10, selections=20000, batch_size=2000)

Map (num_proc=10):   0%|          | 0/20000 [00:00<?, ? examples/s]

In [16]:
wiki_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'url', 'title', 'text'],
        num_rows: 3767787
    })
})

In [17]:
def gpu_spacy_process(num_proc = 1, ranges = range(100), batch_size = 1000):
    """Add a ``"ner"`` column (list of entity strings per row) to wiki rows using spaCy on GPU 0.

    Args:
        num_proc: Number of worker processes passed to ``Dataset.map``.
        ranges: Row indices of ``wiki_datasets["train"]`` to process. A plain
            integer ``n`` is accepted as shorthand for ``range(n)``.
        batch_size: Batch size passed to ``Dataset.map``.

    Returns:
        The selected rows with an extra ``"ner"`` column.
    """
    from thinc.api import require_gpu

    import spacy

    # Several cells call this with a row count instead of a range.
    if isinstance(ranges, int):
        ranges = range(ranges)

    # Filled lazily inside each worker so the model is loaded once per process
    # instead of once per batch.
    pipeline_cache = {}

    def load_ner_pipeline():
        """Load the NER-only pipeline, downloading the model if it is missing."""
        try:
            return spacy.load('en_core_web_sm', enable=["ner"])
        except OSError:
            logging.warning("Downloading language model for the spaCy model.")
            from spacy.cli import download
            download('en_core_web_sm')
            # Keep the fallback consistent with the primary load (NER only).
            return spacy.load('en_core_web_sm', enable=["ner"])

    def process(examples):
        """Annotate one batch with the entities found in each text."""
        # Must run before the model is loaded so it is allocated on the GPU.
        require_gpu(0)
        if "ner" not in pipeline_cache:
            pipeline_cache["ner"] = load_ner_pipeline()
        spacy_pipeline = pipeline_cache["ner"]

        examples["ner"] = [
            [a.text for a in doc.ents]
            for doc in spacy_pipeline.pipe(examples["text"])
        ]
        return examples

    return wiki_datasets["train"].select(ranges).map(
        process,
        batched=True,
        num_proc=num_proc,
        batch_size=batch_size
    )
#gpu_spacy_process(2, 20000, 1000)

In [11]:
sample_ner = gpu_spacy_process(8, 20000, 1000)

Map (num_proc=8):   0%|          | 0/20000 [00:00<?, ? examples/s]

In [14]:
sample_ner = gpu_spacy_process(8, 20000, 1000)

Map (num_proc=8):   0%|          | 0/20000 [00:00<?, ? examples/s]

In [16]:
sample_ner = gpu_spacy_process(20, 20000, 1000)

Map (num_proc=8):   0%|          | 0/20000 [00:00<?, ? examples/s]

In [17]:
sample_ner = gpu_spacy_process(20, 20000, 1000)

Map (num_proc=20):   0%|          | 0/20000 [00:00<?, ? examples/s]

In [None]:
# Run NER over the first 20 chunks of 10 rows each, clamping the last index to
# the dataset size. Each iteration overwrites `sample_ner`, so only the final
# chunk's result is kept.
for i in range(0, 20):
    step_size = 10
    start_index = i*step_size
    end_index = min((i+1)*step_size, wiki_datasets["train"].shape[0])
    r = range(start_index, end_index)
    sample_ner = gpu_spacy_process(20, r, 3000)

num_proc must be <= 10. Reducing num_proc to 10 for dataset of size 10.


Map (num_proc=10):   0%|          | 0/10 [00:00<?, ? examples/s]

In [9]:
wiki_datasets["train"].shape[0] / 20

188389.35

In [19]:
sample_ner = gpu_spacy_process(10, range(20000), 2000)

Map (num_proc=10):   0%|          | 0/20000 [00:00<?, ? examples/s]

In [25]:
sample_ner = gpu_spacy_process(10, range(20000), 2000)

Map (num_proc=10):   0%|          | 0/20000 [00:00<?, ? examples/s]

In [30]:
sample_ner = gpu_spacy_process(10, range(200000), 2000)

Map (num_proc=10):   0%|          | 0/200000 [00:00<?, ? examples/s]

In [32]:
sample_ner = gpu_spacy_process(1, range(20000), 20000)

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

In [34]:
sample_ner = gpu_spacy_process(10, range(20000), 1000)

Map (num_proc=10):   0%|          | 0/20000 [00:00<?, ? examples/s]

In [36]:
sample_ner = gpu_spacy_process(10, range(20000), 1000)

Map (num_proc=10):   0%|          | 0/20000 [00:00<?, ? examples/s]

In [37]:
sample_ner = gpu_spacy_process(10, range(20000), 2000)

Map (num_proc=10):   0%|          | 0/20000 [00:00<?, ? examples/s]

In [9]:
sample_ner = gpu_spacy_process(10, range(20000), 2000)

Map (num_proc=10):   0%|          | 0/20000 [00:00<?, ? examples/s]

In [10]:
sample_ner = gpu_spacy_process(10, range(20000), 1000)

Map (num_proc=10):   0%|          | 0/20000 [00:00<?, ? examples/s]

In [11]:
sample_ner = gpu_spacy_process(10, range(20000), 100)

Map (num_proc=10):   0%|          | 0/20000 [00:00<?, ? examples/s]

In [13]:
sample_ner = gpu_spacy_process(10, range(20000), 100)

Map (num_proc=10):   0%|          | 0/20000 [00:00<?, ? examples/s]

In [14]:
sample_ner = gpu_spacy_process(10, range(20000), 1000)

Map (num_proc=10):   0%|          | 0/20000 [00:00<?, ? examples/s]

In [15]:
sample_ner = gpu_spacy_process(10, range(20000), 2000)

Map (num_proc=10):   0%|          | 0/20000 [00:00<?, ? examples/s]

In [16]:
sample_ner = gpu_spacy_process(10, range(40000), 4000)

Map (num_proc=10):   0%|          | 0/40000 [00:00<?, ? examples/s]

In [17]:
sample_ner = gpu_spacy_process(12, range(20000), 2000)

Map (num_proc=12):   0%|          | 0/20000 [00:00<?, ? examples/s]

In [18]:
sample_ner = gpu_spacy_process(12, range(40000), 2000)

Map (num_proc=12):   0%|          | 0/40000 [00:00<?, ? examples/s]

In [19]:
sample_ner = gpu_spacy_process(10, range(40000), 2000)

Map (num_proc=10):   0%|          | 0/40000 [00:00<?, ? examples/s]

In [21]:
sample_ner = gpu_spacy_process(10, range(40000), 1000)

Map (num_proc=10):   0%|          | 0/40000 [00:00<?, ? examples/s]

In [22]:
sample_ner = gpu_spacy_process(10, range(40000), 500)

Map (num_proc=10):   0%|          | 0/40000 [00:00<?, ? examples/s]

In [23]:
sample_ner = gpu_spacy_process(10, range(40000), 200)

Map (num_proc=10):   0%|          | 0/40000 [00:00<?, ? examples/s]

In [24]:
sample_ner = gpu_spacy_process(10, range(40000), 50)

Map (num_proc=10):   0%|          | 0/40000 [00:00<?, ? examples/s]

In [25]:
sample_ner = gpu_spacy_process(10, range(40000), 300)

Map (num_proc=10):   0%|          | 0/40000 [00:00<?, ? examples/s]

In [26]:
sample_ner = gpu_spacy_process(10, range(40000), 400)

Map (num_proc=10):   0%|          | 0/40000 [00:00<?, ? examples/s]

In [27]:
sample_ner = gpu_spacy_process(10, range(40000), 250)

Map (num_proc=10):   0%|          | 0/40000 [00:00<?, ? examples/s]

In [28]:
sample_ner = gpu_spacy_process(12, range(40000), 100)

Map (num_proc=12):   0%|          | 0/40000 [00:00<?, ? examples/s]

In [29]:
sample_ner = gpu_spacy_process(12, range(40000), 225)

Map (num_proc=12):   0%|          | 0/40000 [00:00<?, ? examples/s]

In [4]:
sample_ner = gpu_spacy_process(10, range(40000), 200)

Map (num_proc=10):   0%|          | 0/40000 [00:00<?, ? examples/s]

In [9]:
sample_ner = gpu_spacy_process(10, range(40000), 250)

Map (num_proc=10):   0%|          | 0/40000 [00:00<?, ? examples/s]

In [10]:
sample_ner = gpu_spacy_process(12, range(40000), 250)

Map (num_proc=12):   0%|          | 0/40000 [00:00<?, ? examples/s]

In [11]:
sample_ner = gpu_spacy_process(10, range(40000), 250)

Map (num_proc=11):   0%|          | 0/40000 [00:00<?, ? examples/s]

In [15]:
sample_ner = gpu_spacy_process(10, range(40000), 250)

Map (num_proc=10):   0%|          | 0/40000 [00:00<?, ? examples/s]

In [16]:
sample_ner = gpu_spacy_process(10, range(40000), 250)

Map (num_proc=10):   0%|          | 0/40000 [00:00<?, ? examples/s]

In [18]:
sample_ner = gpu_spacy_process(10, range(40000), 200)

Map (num_proc=10):   0%|          | 0/40000 [00:00<?, ? examples/s]

In [19]:
sample_ner = gpu_spacy_process(10, range(40000), 250)

Map (num_proc=10):   0%|          | 0/40000 [00:00<?, ? examples/s]

In [20]:
sample_ner = gpu_spacy_process(8, range(40000), 250)

Map (num_proc=8):   0%|          | 0/40000 [00:00<?, ? examples/s]

In [21]:
sample_ner = gpu_spacy_process(6, range(40000), 250)

Map (num_proc=6):   0%|          | 0/40000 [00:00<?, ? examples/s]

In [22]:
sample_ner = gpu_spacy_process(8, range(40000), 350)

Map (num_proc=8):   0%|          | 0/40000 [00:00<?, ? examples/s]

In [23]:
sample_ner = gpu_spacy_process(8, range(40000), 500)

Map (num_proc=8):   0%|          | 0/40000 [00:00<?, ? examples/s]

In [24]:
sample_ner = gpu_spacy_process(9, range(40000), 500)

Map (num_proc=9):   0%|          | 0/40000 [00:00<?, ? examples/s]

In [25]:
sample_ner = gpu_spacy_process(10, range(40000), 500)

Map (num_proc=10):   0%|          | 0/40000 [00:00<?, ? examples/s]

In [26]:
sample_ner = gpu_spacy_process(9, range(40000), 250)

Map (num_proc=9):   0%|          | 0/40000 [00:00<?, ? examples/s]

In [27]:
sample_ner = gpu_spacy_process(9, range(40000), 400)

Map (num_proc=9):   0%|          | 0/40000 [00:00<?, ? examples/s]

In [28]:
sample_ner = gpu_spacy_process(9, range(wiki_datasets["train"].shape[0]), 400)

Map (num_proc=9):   0%|          | 0/3767787 [00:00<?, ? examples/s]

In [29]:
sample_ner

Dataset({
    features: ['id', 'url', 'title', 'text', 'ner'],
    num_rows: 3767787
})

In [31]:
# Raw string: the backslash sequences in this Windows path are invalid escapes in a
# normal string literal; the resulting path is unchanged.
sample_ner.save_to_disk(r"C:\.cache\huggingface\datasets\wiki_ner_100_1000")

Saving the dataset (0/25 shards):   0%|          | 0/3767787 [00:00<?, ? examples/s]

In [2]:
def process_ds(dataset, output_dir, num_proc=1, ranges=range(100), batch_size=1000,
               text_column="article", summary_column="abstract"):
    """Tag entities, mask the selected rows of ``dataset`` and save them to disk.

    The selected rows go through two ``Dataset.map`` passes:

    1. ``gpu_spacy_process`` adds a ``"ner"`` column holding the spaCy entity
       strings found in ``text_column``.
    2. ``pre_process_dataset`` splits every text into sentences, scores each
       sentence (ROUGE-1 against the rest of the text plus 0.25 for every
       entity string it contains) and masks it: entity spans become
       ``<mask1>``, other tokens ``<mask2>``. ``text_column`` is replaced by
       the masked text prefixed with connector tokens, ``summary_column`` by
       a pseudo-summary built from the text's own sentences, and
       ``"mlm_label"`` holds the masked-out tokens (``<unmasked>`` elsewhere).

    Args:
        dataset: Dataset split to process (anything offering ``select``/``map``).
        output_dir: Directory passed to ``save_to_disk`` for the final dataset.
        num_proc: Worker processes for the masking pass (NER uses one process).
        ranges: Row indices to process.
        batch_size: Batch size of both ``map`` passes.
        text_column: Name of the column with the source text.
        summary_column: Name of the column that receives the pseudo-summary.

    Returns:
        The processed dataset (the same object that was saved).

    Raises:
        ValueError: If neither masking mode is enabled in ``data_args``.
    """

    def gpu_spacy_process(dataset, ranges=range(100), batch_size=1000):
        """Add a ``"ner"`` column with the entity texts of every selected row."""
        import spacy
        from thinc.api import require_gpu

        # Filled on first use so the model is loaded once per process instead of
        # once per batch.
        pipeline_cache = {}

        def load_ner_pipeline():
            """Load the NER-only spaCy pipeline on GPU 0, downloading it if missing."""
            require_gpu(0)
            try:
                return spacy.load('en_core_web_sm', enable=["ner"])
            except OSError:
                logging.warning("Downloading language model for the spaCy model.")
                from spacy.cli import download
                download('en_core_web_sm')
                # Same component selection as the primary path.
                return spacy.load('en_core_web_sm', enable=["ner"])

        def process(examples):
            if "nlp" not in pipeline_cache:
                pipeline_cache["nlp"] = load_ner_pipeline()

            spacy_docs = pipeline_cache["nlp"].pipe(examples[text_column])
            examples["ner"] = [[ent.text for ent in doc.ents] for doc in spacy_docs]
            return examples

        return dataset.select(ranges).map(
            process,
            batched=True,
            num_proc=1,
            batch_size=batch_size
        )

    def pre_process_dataset(dataset, num_proc=1, ranges=range(100), batch_size=1000):
        """Score and mask the sentences of every row; expects a ``"ner"`` column."""
        # Imports stay local so the mapped function carries them into worker processes.
        import random
        import string

        import evaluate
        import nltk
        import numpy as np
        import pandas as pd

        data_args = {
            "ner_mlm": True,
            "ner_sgs_mlm": False,
            "ner_mlm_prob": 0.6
        }
        if not (data_args["ner_mlm"] or data_args["ner_sgs_mlm"]):
            raise ValueError("Enable either 'ner_mlm' or 'ner_sgs_mlm' in data_args.")

        NER_MASK = "<mask1>"
        NER_TOKEN_MASK = "<mask2>"
        MLM_CONNECTOR = "<conn1>"
        MLM_SGS_CONNECTOR = "<conn2>"
        UNMASKED = "<unmasked>"

        # Per-token masking states stored in the masking array.
        KEPT, TOKEN_MASKED, ENTITY_MASKED = 0, 1, 2

        strip_punctuation = str.maketrans('', '', string.punctuation)

        rouge = evaluate.load("rouge")

        def get_other_text(sentence, text):
            """Return ``text`` without the first occurrence of ``sentence``.

            Falls back to the full text when only whitespace would remain, so
            the ROUGE reference is never blank.
            """
            rest = text.replace(sentence, "", 1)
            return rest if rest.strip() else text

        def mask_sentence(sentence, entities, prob_ner, prob_token):
            """Mask one sentence.

            Returns ``[masked_sentence, label, masking]`` where ``masking`` has
            one state per punctuation-free token (0 kept, 1 token-masked,
            2 entity-masked). A run of equal mask states is emitted as a
            single mask token.
            """
            tokens = sentence.translate(strip_punctuation).split()
            masking = np.zeros(len(tokens))

            for entity in entities:
                if entity not in sentence:
                    continue
                # One random draw per matching entry of the entity list.
                if random.random() > prob_ner:
                    continue
                # Tokens have no punctuation, so the entity must be normalised the
                # same way or entities such as "U.S." could never match.
                entity_tokens = entity.translate(strip_punctuation).split()
                if not entity_tokens:
                    continue
                width = len(entity_tokens)
                for start in range(len(tokens) - width + 1):
                    if tokens[start:start + width] == entity_tokens:
                        masking[start:start + width] = ENTITY_MASKED

            for position in range(len(tokens)):
                if masking[position] == ENTITY_MASKED:
                    continue
                masking[position] = TOKEN_MASKED if random.random() <= prob_token else KEPT

            pieces = []
            for position, state in enumerate(masking):
                repeats_previous = position > 0 and masking[position - 1] == state
                if state == ENTITY_MASKED:
                    if not repeats_previous:
                        pieces.append(NER_MASK)
                elif state == TOKEN_MASKED:
                    # Emit the mask at the start of a run and skip the repeats; an
                    # isolated masked token therefore still yields one <mask2>.
                    if not repeats_previous:
                        pieces.append(NER_TOKEN_MASK)
                else:
                    pieces.append(tokens[position])

            label = " ".join(
                token if state else UNMASKED for token, state in zip(tokens, masking)
            )
            return [" ".join(pieces), label, masking]

        def sentence_scorer(sentence, entities, rouge_score):
            """ROUGE score plus 0.25 for every entity string found in the sentence."""
            entity_hits = sum(1 for entity in entities if entity in sentence)
            return entity_hits / 4 + rouge_score

        def to_sentences(text):
            """Split ``text`` into sentences and build each sentence's reference text."""
            sentences = nltk.sent_tokenize(text)
            texts = [get_other_text(sent, text) for sent in sentences]
            return (texts, sentences)

        def custom_train_preprocess(examples):
            batch_texts = examples[text_column]

            # Flatten all sentences of the batch so ROUGE is computed in one call;
            # groups remembers which slice belongs to which row.
            all_texts = []
            all_sentences = []
            groups = []
            for text in batch_texts:
                texts, sentences = to_sentences(text)
                start_index = len(all_texts)
                all_texts.extend(texts)
                all_sentences.extend(sentences)
                groups.append((start_index, len(all_texts), sentences))

            if all_sentences:
                result_rouge = rouge.compute(
                    predictions=all_sentences,
                    references=all_texts,
                    rouge_types=["rouge1"],
                    use_aggregator=False,
                )
                rouge_scores = list(result_rouge.get("rouge1"))
            else:
                rouge_scores = []

            documents = []
            summaries = []
            mlm_labels = []
            for i, text in enumerate(batch_texts):
                start_index, end_index, sentences = groups[i]

                # Empty or whitespace-only rows have nothing to score or mask.
                if not sentences:
                    mlm_labels.append("")
                    summaries.append("")
                    documents.append(text)
                    continue

                list_entities = examples["ner"][i]

                df = pd.DataFrame({
                    "sentence": sentences,
                    "rouge1": rouge_scores[start_index:end_index],
                })
                df["score"] = [
                    sentence_scorer(sentence, list_entities, rouge_score)
                    for sentence, rouge_score in zip(df["sentence"], df["rouge1"])
                ]
                df = df.sort_values("score", ascending=False)
                # Small epsilon keeps the division defined when every score is 0.
                df["normalized_score"] = df["score"] / (df["score"].max() + 0.001)

                # Higher-scoring sentences get a higher token-masking probability.
                token_prob_scale = 0.1 + data_args["ner_mlm_prob"]
                masked = [
                    mask_sentence(sentence, list_entities, data_args["ner_mlm_prob"], normalized * token_prob_scale)
                    for sentence, normalized in zip(df["sentence"], df["normalized_score"])
                ]
                df["masked"] = [item[0] for item in masked]
                df["labels"] = [item[1] for item in masked]

                if data_args["ner_mlm"]:
                    # mlm: the target is the whole text in original sentence order.
                    summary = " ".join(df["sentence"].sort_index())
                    prefix = [MLM_CONNECTOR]
                else:
                    # mlm-sgs: the target is the top-scoring quarter of the sentences
                    # (at least one), restored to original order.
                    num_sentences = int(round(df.shape[0] * 0.25, 0)) or 1
                    summary = " ".join(df.head(num_sentences).sort_index()["sentence"])
                    prefix = [MLM_CONNECTOR, MLM_SGS_CONNECTOR]

                masked_text = " ".join(df["masked"].sort_index())

                mlm_labels.append(" ".join(df["labels"].sort_index()))
                summaries.append(summary)
                documents.append(" ".join(prefix + [masked_text]))

            new_examples = {}
            new_examples[text_column] = documents
            new_examples[summary_column] = summaries
            new_examples["mlm_label"] = mlm_labels
            return new_examples

        return dataset.select(ranges).map(
            custom_train_preprocess,
            batched=True,
            num_proc=num_proc,
            batch_size=batch_size
        )

    ner_dataset = gpu_spacy_process(dataset, ranges, batch_size)

    processed_dataset = pre_process_dataset(ner_dataset, num_proc, range(len(ner_dataset)), batch_size)

    processed_dataset.save_to_disk(output_dir)

    return processed_dataset
                
#gpu_spacy_process(2, 20000, 1000)

In [58]:
#from datasets import load_from_disk

# Load every split of the XSum dataset.
xsum_dataset = load_dataset("xsum")

Downloading builder script:   0%|          | 0.00/5.76k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/6.24k [00:00<?, ?B/s]

In [13]:
# NER-tag the first 20,000 rows of the XSum training split; the result is only displayed,
# not stored. NOTE(review): this call passes the dataset first, then what is presumably
# num_proc (2) and batch size (1000) — the matching definition is not in this part of
# the notebook, so confirm which version was live when the cell ran.
gpu_spacy_process(xsum_dataset["train"], 2, range(20000), 1000)

Map (num_proc=2):   0%|          | 0/20000 [00:00<?, ? examples/s]

Dataset({
    features: ['document', 'summary', 'id', 'ner'],
    num_rows: 20000
})

In [71]:
# Process the whole XSum training split and store it under output_dir.
# NOTE(review): XSum exposes 'document'/'summary' columns — confirm the process_ds
# definition that was live for this run read those columns.
output_dir = r"G:\.cache\huggingface\datasets\mlm\xsum_slg_mlm"
sample = process_ds(
    dataset=xsum_dataset["train"],
    output_dir=output_dir,
    num_proc=10,
    ranges=range(len(xsum_dataset["train"])),
    batch_size=1000,
)

Map (num_proc=2):   0%|          | 0/204045 [00:00<?, ? examples/s]

Map (num_proc=10):   0%|          | 0/204045 [00:00<?, ? examples/s]

Saving the dataset (0/4 shards):   0%|          | 0/204045 [00:00<?, ? examples/s]

In [8]:
# Re-run of the full XSum training split, written to the same output_dir as before.
# NOTE(review): XSum exposes 'document'/'summary' columns — confirm the process_ds
# definition that was live for this run read those columns.
output_dir = r"G:\.cache\huggingface\datasets\mlm\xsum_slg_mlm"
sample = process_ds(
    dataset=xsum_dataset["train"],
    output_dir=output_dir,
    num_proc=10,
    ranges=range(len(xsum_dataset["train"])),
    batch_size=1000,
)

Map (num_proc=10):   0%|          | 0/204045 [00:00<?, ? examples/s]

Saving the dataset (0/3 shards):   0%|          | 0/204045 [00:00<?, ? examples/s]

In [7]:
# Inspect the first processed example.
sample[0]

 'summary': "The full cost of damage in Newton Stewart, one of the areas worst affected, is still being assessed. Repair work is ongoing in Hawick and many roads in Peeblesshire remain badly affected by standing water. Many businesses and householders were affected by flooding in Newton Stewart after the River Cree overflowed into the town. Scottish Borders Council has put a list on its website of the roads worst affected and drivers have been urged not to ignore closure signs. The Labour Party's deputy Scottish leader Alex Rowley was in Hawick on Monday to see the situation first hand. Have you been affected by flooding in Dumfries and Galloway or the Borders?",
 'id': '35232142',
 'ner': ['Newton Stewart',
  'one',
  'Repair',
  'Hawick',
  'Peeblesshire',
  'Trains',
  'west coast',
  'the Lamington Viaduct',
  'Newton Stewart',
  'the River Cree',
  'First',
  'Nicola Sturgeon',
  'Victoria Street',
  'Jeanette Tate',
  'the Cinnamon Cafe',
  'Dumfries and the Nith',
  'the last fe

In [7]:
# Load every split of the arXiv summarization dataset.
dataset = load_dataset("ccdv/arxiv-summarization")

In [6]:
# Show the available splits with their columns and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['article', 'abstract'],
        num_rows: 203037
    })
    validation: Dataset({
        features: ['article', 'abstract'],
        num_rows: 6436
    })
    test: Dataset({
        features: ['article', 'abstract'],
        num_rows: 6440
    })
})

In [None]:
# Process the whole arXiv training split in batches of 100 and store it under output_dir.
output_dir = r"G:\.cache\huggingface\datasets\mlm\arxiv"
sample = process_ds(
    dataset=dataset["train"],
    output_dir=output_dir,
    num_proc=10,
    ranges=range(len(dataset["train"])),
    batch_size=100,
)

Map:   0%|          | 0/203037 [00:00<?, ? examples/s]

In [8]:
def process_ds(dataset, output_dir, num_proc=1, ranges=range(100), batch_size=1000,
               text_column="article", summary_column="abstract"):
    """Truncate, entity-tag and mask the selected rows of ``dataset``, then save them.

    The selected rows go through three ``Dataset.map`` passes:

    1. ``max_length`` keeps the first 1024 whitespace-separated words of
       ``text_column``.
    2. ``gpu_spacy_process`` adds a ``"ner"`` column holding the spaCy entity
       strings found in the truncated text.
    3. ``pre_process_dataset`` splits every text into sentences, scores each
       sentence (ROUGE-1 against the rest of the text plus 0.25 for every
       entity string it contains), keeps the best-scoring sentences up to
       roughly 512 words and masks them: entity spans become ``<mask1>``,
       other tokens ``<mask2>``. ``text_column`` receives the kept sentences
       unmasked, ``"masked_document"`` their masked form prefixed with
       ``<conn1>``, ``"mlm_label"`` the masked-out tokens (``<unmasked>``
       elsewhere); ``summary_column`` keeps its original value.

    Args:
        dataset: Dataset split to process (anything offering ``select``/``map``).
        output_dir: Directory passed to ``save_to_disk`` for the final dataset.
        num_proc: Worker processes for the truncation and masking passes
            (the NER pass always uses two).
        ranges: Row indices to process.
        batch_size: Batch size of all ``map`` passes.
        text_column: Name of the column with the source text.
        summary_column: Name of the column with the reference summary.

    Returns:
        The processed dataset (the same object that was saved).
    """
    MAX_ARTICLE_WORDS = 1024   # words kept per text before NER and masking
    MAX_SELECTED_WORDS = 512   # word budget for the sentences kept per text

    def max_length(dataset, num_proc, ranges, batch_size):
        """Truncate ``text_column`` to its first ``MAX_ARTICLE_WORDS`` words."""

        def process(examples):
            examples[text_column] = [
                " ".join(text.split()[0:MAX_ARTICLE_WORDS])
                for text in examples[text_column]
            ]
            return examples

        return dataset.select(ranges).map(
            process,
            batched=True,
            num_proc=num_proc,
            batch_size=batch_size
        )

    def gpu_spacy_process(dataset, ranges=range(100), batch_size=1000):
        """Add a ``"ner"`` column with the entity texts of every selected row."""
        import spacy
        from thinc.api import require_gpu

        # Filled on first use so each worker loads the model once instead of
        # once per batch.
        pipeline_cache = {}

        def load_ner_pipeline():
            """Load the NER-only spaCy pipeline on GPU 0, downloading it if missing."""
            require_gpu(0)
            try:
                return spacy.load('en_core_web_sm', enable=["ner"])
            except OSError:
                logging.warning("Downloading language model for the spaCy model.")
                from spacy.cli import download
                download('en_core_web_sm')
                # Same component selection as the primary path.
                return spacy.load('en_core_web_sm', enable=["ner"])

        def process(examples):
            if "nlp" not in pipeline_cache:
                pipeline_cache["nlp"] = load_ner_pipeline()

            spacy_docs = pipeline_cache["nlp"].pipe(examples[text_column])
            examples["ner"] = [[ent.text for ent in doc.ents] for doc in spacy_docs]
            return examples

        return dataset.select(ranges).map(
            process,
            batched=True,
            num_proc=2,
            batch_size=batch_size
        )

    def pre_process_dataset(dataset, num_proc=1, ranges=range(100), batch_size=1000):
        """Score, select and mask the sentences of every row; expects a ``"ner"`` column."""
        # Imports stay local so the mapped function carries them into worker processes.
        import random
        import string

        import evaluate
        import nltk
        import numpy as np
        import pandas as pd

        data_args = {
            "ner_mlm": True,
            "ner_sgs_mlm": False,
            "ner_mlm_prob": 0.35
        }

        NER_MASK = "<mask1>"
        NER_TOKEN_MASK = "<mask2>"
        MLM_CONNECTOR = "<conn1>"
        UNMASKED = "<unmasked>"

        # Per-token masking states stored in the masking array.
        KEPT, TOKEN_MASKED, ENTITY_MASKED = 0, 1, 2

        strip_punctuation = str.maketrans('', '', string.punctuation)

        rouge = evaluate.load("rouge")

        def get_other_text(sentence, text):
            """Return ``text`` without the first occurrence of ``sentence``.

            Falls back to the full text when only whitespace would remain, so
            the ROUGE reference is never blank.
            """
            rest = text.replace(sentence, "", 1)
            return rest if rest.strip() else text

        def mask_sentence(sentence, entities, prob_ner, prob_token):
            """Mask one sentence.

            Returns ``[masked_sentence, label, masking]`` where ``masking`` has
            one state per punctuation-free token (0 kept, 1 token-masked,
            2 entity-masked). A run of equal mask states is emitted as a
            single mask token.
            """
            tokens = sentence.translate(strip_punctuation).split()
            masking = np.zeros(len(tokens))

            for entity in entities:
                if entity not in sentence:
                    continue
                # One random draw per matching entry of the entity list.
                if random.random() > prob_ner:
                    continue
                # Tokens have no punctuation, so the entity must be normalised the
                # same way or entities such as "U.S." could never match.
                entity_tokens = entity.translate(strip_punctuation).split()
                if not entity_tokens:
                    continue
                width = len(entity_tokens)
                for start in range(len(tokens) - width + 1):
                    if tokens[start:start + width] == entity_tokens:
                        masking[start:start + width] = ENTITY_MASKED

            for position in range(len(tokens)):
                if masking[position] == ENTITY_MASKED:
                    continue
                masking[position] = TOKEN_MASKED if random.random() <= prob_token else KEPT

            pieces = []
            for position, state in enumerate(masking):
                repeats_previous = position > 0 and masking[position - 1] == state
                if state == ENTITY_MASKED:
                    if not repeats_previous:
                        pieces.append(NER_MASK)
                elif state == TOKEN_MASKED:
                    # Emit the mask at the start of a run and skip the repeats; an
                    # isolated masked token therefore still yields one <mask2>.
                    if not repeats_previous:
                        pieces.append(NER_TOKEN_MASK)
                else:
                    pieces.append(tokens[position])

            label = " ".join(
                token if state else UNMASKED for token, state in zip(tokens, masking)
            )
            return [" ".join(pieces), label, masking]

        def sentence_scorer(sentence, entities, rouge_score):
            """ROUGE score plus 0.25 for every entity string found in the sentence."""
            entity_hits = sum(1 for entity in entities if entity in sentence)
            return entity_hits / 4 + rouge_score

        def to_sentences(text):
            """Split ``text`` into sentences and build each sentence's reference text."""
            sentences = nltk.sent_tokenize(text)
            texts = [get_other_text(sent, text) for sent in sentences]
            return (texts, sentences)

        def custom_train_preprocess(examples):
            batch_texts = examples[text_column]

            # Flatten all sentences of the batch so ROUGE is computed in one call;
            # groups remembers which slice belongs to which row.
            all_texts = []
            all_sentences = []
            groups = []
            for text in batch_texts:
                texts, sentences = to_sentences(text)
                start_index = len(all_texts)
                all_texts.extend(texts)
                all_sentences.extend(sentences)
                groups.append((start_index, len(all_texts), sentences))

            if all_sentences:
                result_rouge = rouge.compute(
                    predictions=all_sentences,
                    references=all_texts,
                    rouge_types=["rouge1"],
                    use_aggregator=False,
                )
                rouge_scores = list(result_rouge.get("rouge1"))
            else:
                rouge_scores = []

            documents = []
            summaries = []
            mlm_labels = []
            masked_documents = []
            for i, text in enumerate(batch_texts):
                start_index, end_index, sentences = groups[i]

                # Empty or whitespace-only rows have nothing to score or mask;
                # they keep their text and summary and get empty masking columns.
                if not sentences:
                    mlm_labels.append("")
                    summaries.append(examples[summary_column][i])
                    documents.append(text)
                    masked_documents.append("")
                    continue

                list_entities = examples["ner"][i]

                df = pd.DataFrame({
                    "sentence": sentences,
                    "rouge1": rouge_scores[start_index:end_index],
                })
                df["score"] = [
                    sentence_scorer(sentence, list_entities, rouge_score)
                    for sentence, rouge_score in zip(df["sentence"], df["rouge1"])
                ]
                df = df.sort_values("score", ascending=False)
                # Small epsilon keeps the division defined when every score is 0.
                df["normalized_score"] = df["score"] / (df["score"].max() + 0.001)

                # Higher-scoring sentences get a higher token-masking probability.
                token_prob_scale = 0.1 + data_args["ner_mlm_prob"]
                masked = [
                    mask_sentence(sentence, list_entities, data_args["ner_mlm_prob"], normalized * token_prob_scale)
                    for sentence, normalized in zip(df["sentence"], df["normalized_score"])
                ]
                df["masked"] = [item[0] for item in masked]
                df["labels"] = [item[1] for item in masked]

                document = ""
                summary = ""
                masked_document = ""
                mlm_label = ""

                if data_args["ner_mlm"]:
                    # Take sentences best-first until the word budget is exceeded
                    # (the sentence that crosses the budget is still included).
                    num_sentences = 0
                    word_count = 0
                    for sentence in df["sentence"]:
                        word_count += len(sentence.split())
                        num_sentences += 1
                        if word_count > MAX_SELECTED_WORDS:
                            break

                    # Restore the original sentence order of the selection.
                    selected = df.head(num_sentences).sort_index()
                    document = " ".join(selected["sentence"])
                    summary = examples[summary_column][i]
                    masked_document = " ".join([MLM_CONNECTOR, " ".join(selected["masked"])])
                    mlm_label = " ".join(selected["labels"])

                mlm_labels.append(mlm_label)
                masked_documents.append(masked_document)
                summaries.append(summary)
                documents.append(document)

            new_examples = {}
            new_examples[text_column] = documents
            new_examples[summary_column] = summaries
            new_examples["mlm_label"] = mlm_labels
            new_examples["masked_document"] = masked_documents
            return new_examples

        return dataset.select(ranges).map(
            custom_train_preprocess,
            batched=True,
            num_proc=num_proc,
            batch_size=batch_size
        )

    maxed_length_dataset = max_length(dataset, num_proc, ranges, batch_size)

    ner_dataset = gpu_spacy_process(maxed_length_dataset, range(len(maxed_length_dataset)), batch_size)

    processed_dataset = pre_process_dataset(ner_dataset, num_proc, range(len(ner_dataset)), batch_size)

    processed_dataset.save_to_disk(output_dir)

    return processed_dataset
                
#gpu_spacy_process(2, 20000, 1000)

In [9]:
# Process the whole arXiv training split and store it under output_dir.
output_dir = r"G:\.cache\huggingface\datasets\mlm\arxiv_0.3_1024"
sample = process_ds(
    dataset=dataset["train"],
    output_dir=output_dir,
    num_proc=10,
    ranges=range(len(dataset["train"])),
    batch_size=1000,
)

Map (num_proc=10):   0%|          | 0/203037 [00:00<?, ? examples/s]

Saving the dataset (0/5 shards):   0%|          | 0/203037 [00:00<?, ? examples/s]

In [10]:
# Process the whole arXiv validation split and store it under output_dir.
output_dir = r"G:\.cache\huggingface\datasets\mlm\arxiv_val_0.3_1024"
sample = process_ds(
    dataset=dataset["validation"],
    output_dir=output_dir,
    num_proc=10,
    ranges=range(len(dataset["validation"])),
    batch_size=1000,
)

Map (num_proc=10):   0%|          | 0/6436 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/6436 [00:00<?, ? examples/s]

In [11]:
# Process the whole arXiv test split and store it under output_dir.
output_dir = r"G:\.cache\huggingface\datasets\mlm\arxiv_test_0.3_1024"
sample = process_ds(
    dataset=dataset["test"],
    output_dir=output_dir,
    num_proc=10,
    ranges=range(len(dataset["test"])),
    batch_size=1000,
)

Map (num_proc=10):   0%|          | 0/6440 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/6440 [00:00<?, ? examples/s]

In [156]:
# Display the processed dataset's summary (feature names and row count).
sample

Dataset({
    features: ['article', 'abstract', 'ner', 'mlm_label', 'masked_document'],
    num_rows: 6440
})

In [147]:
# Inspect the "abstract" field of the second processed example.
sample[1]["abstract"]

'we have studied the leptonic decay @xmath0 , via the decay channel @xmath1 , using a sample of tagged @xmath2 decays collected near the @xmath3 peak production energy in @xmath4 collisions with the cleo - c detector . \n we obtain @xmath5 and determine the decay constant @xmath6  mev , where the first uncertainties are statistical and the second are systematic .'

In [146]:
# Inspect the "article" field of the second processed example.
sample[1]["article"]

'the leptonic decays of a charged pseudoscalar meson @xmath7 are processes of the type @xmath8 , where @xmath9 , @xmath10 , or @xmath11 . because no strong interactions are present in the leptonic final state @xmath12 , such decays provide a clean way to probe the complex , strong interactions that bind the quark and antiquark within the initial - state meson . in these decays , strong interaction effects can be parametrized by a single quantity , @xmath13 , the pseudoscalar meson decay constant . the leptonic decay rate can be measured by experiment , and the decay constant can be determined by the equation ( ignoring radiative corrections ) @xmath14 where @xmath15 is the fermi coupling constant , @xmath16 is the cabibbo - kobayashi - maskawa ( ckm ) matrix @xcite element , @xmath17 is the mass of the meson , and @xmath18 is the mass of the charged lepton . the quantity @xmath13 describes the amplitude for the @xmath19 and @xmath20-quarks within the @xmath21 to have zero separation , 