In [2]:
import logging
import os
import sys
from dataclasses import dataclass, field
from typing import Optional

import datasets
import nltk  # Here to have a nice missing dependency error message early on
import numpy as np
from datasets import load_dataset, load_metric

import transformers
from filelock import FileLock
from transformers import (
    AutoConfig,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    DataCollatorForSeq2Seq,
    HfArgumentParser,
    MBart50Tokenizer,
    MBart50TokenizerFast,
    MBartTokenizer,
    MBartTokenizerFast,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    set_seed,
)
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version, is_offline_mode, send_example_telemetry
from transformers.utils.versions import require_version


from data import DataCollatorForSeq2SeqWithMultipleReferences
from BSF_Trainer import BSFTrainer
from trainer import CustomTrainer

import traceback


# Will error if the minimal version of Transformers is not installed. Remove at your own risks.
# check_min_version("4.21.0.dev0")

# require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt")

logger = logging.getLogger(__name__)

try:
    nltk.data.find("tokenizers/punkt")
except (LookupError, OSError):
    if is_offline_mode():
        raise LookupError(
            "Offline mode: run this script without TRANSFORMERS_OFFLINE first to download nltk data files"
        )
    with FileLock(".lock") as lock:
        nltk.download("punkt", quiet=True)

# A list of all multilingual tokenizer which require lang attribute.
MULTILINGUAL_TOKENIZERS = [MBartTokenizer, MBartTokenizerFast, MBart50Tokenizer, MBart50TokenizerFast]


In [2]:
config = AutoConfig.from_pretrained(
    "facebook/bart-large-xsum",
    cache_dir=None,
    revision="main",
    use_auth_token=None,
)
tokenizer = AutoTokenizer.from_pretrained(
    "facebook/bart-large-xsum",
    cache_dir=None,

    use_fast=False,#model_args.use_fast_tokenizer,
    revision="main",
    use_auth_token= None,
)
model = AutoModelForSeq2SeqLM.from_pretrained(
    "facebook/bart-large-xsum",
    from_tf=bool(".ckpt" in  "facebook/bart-large-xsum"),
    config=config,
    cache_dir=None,
    revision="main",
    use_auth_token=None,
)

model.resize_token_embeddings(len(tokenizer))

Embedding(50265, 1024)

In [20]:
len(tokenizer)

50269

In [19]:
tokenizer.add_special_tokens({ "additional_special_tokens": ["<mask1>", "<mask2>", "<conn_1>", "<conn_2>"] })

4

In [21]:
model.resize_token_embeddings(len(tokenizer))

Embedding(50269, 1024)

In [24]:
raw_datasets = load_dataset(
            "c4",
            "en"
        )

Downloading data files:   0%|          | 0/1024 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/319M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/318M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/320M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/319M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/319M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/318M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/318M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/318M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/318M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/318M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/319M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/320M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/320M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/319M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/320M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/318M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/319M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/320M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/319M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/319M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/319M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/319M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/319M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/320M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/318M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/319M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/318M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/320M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/319M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/319M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/318M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/319M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/320M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/320M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/319M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/319M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/319M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/320M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/318M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/320M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/320M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/320M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/318M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/318M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/319M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/320M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/319M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/318M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/318M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/319M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/318M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/319M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/321M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/321M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/320M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/319M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/319M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/320M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/319M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/318M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/318M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/319M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/319M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/319M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/319M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/320M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/320M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/319M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/319M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/319M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/319M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/318M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/319M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/318M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/319M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/321M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/320M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/319M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/320M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/321M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/320M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/318M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/318M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/319M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/319M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/320M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/319M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/319M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/319M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/319M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/319M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/320M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/319M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/319M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/320M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/320M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/320M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/320M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/321M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/319M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/320M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/319M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/321M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/320M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/319M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/320M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/319M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/320M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/320M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/318M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/320M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/319M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/319M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/318M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/319M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/319M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/319M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/318M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/319M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/319M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/320M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/319M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/321M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/318M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/318M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/318M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/318M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/319M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/318M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/318M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/319M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/319M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/318M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/319M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/320M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/319M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/318M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/319M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/318M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/320M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/320M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/318M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/318M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/319M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/320M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/319M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/319M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/321M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/318M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/319M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/318M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/315M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/320M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/318M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/320M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/318M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/320M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/320M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/320M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/319M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/319M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/319M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/320M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/320M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/320M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/320M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/320M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/319M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/319M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/319M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/318M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/319M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/319M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/319M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/319M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/318M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/319M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/319M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/318M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/319M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/320M [00:00<?, ?B/s]

KeyboardInterrupt: 

In [21]:
print(len(model.model.decoder.layers))

12


In [26]:
raw_datasets = load_dataset(
            "gigaword",
            cache_dir="G:\.cache\huggingface\datasets"
        )

In [30]:
df = raw_datasets["train"].to_pandas()

In [34]:
df["doc_len"] = df["document"].map(lambda x: len(x.split()))
df["sum_len"] = df["summary"].map(lambda x: len(x.split()))


TypeError: 'Series' object is not callable

In [35]:
df.describe()

Unnamed: 0,doc_len,sum_len
count,3803957.0,3803957.0
mean,31.35319,8.229059
std,8.130143,2.419671
min,11.0,2.0
25%,26.0,7.0
50%,31.0,8.0
75%,36.0,10.0
max,99.0,45.0


In [36]:
cnn_datasets = load_dataset(
            "cnn_dailymail",
            "3.0.0"
#            cache_dir="G:\.cache\huggingface\datasets"
        )

Downloading builder script:   0%|          | 0.00/8.33k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/9.88k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/15.1k [00:00<?, ?B/s]

In [40]:
df_cnn = cnn_datasets["train"].to_pandas()
df_cnn["doc_len"] = df_cnn["article"].map(lambda x: len(x.split()))
df_cnn["sum_len"] = df_cnn["highlights"].map(lambda x: len(x.split()))


Unnamed: 0,doc_len,sum_len
count,287113.0,287113.0
mean,691.870326,51.574101
std,336.500292,21.256336
min,8.0,4.0
25%,443.0,38.0
50%,632.0,48.0
75%,877.0,60.0
max,2347.0,1296.0


In [38]:
df_cnn

Unnamed: 0,article,highlights,id
0,"LONDON, England (Reuters) -- Harry Potter star...",Harry Potter star Daniel Radcliffe gets £20M f...,42c027e4ff9730fbb3de84c1af0d2c506e41c3e4
1,Editor's note: In our Behind the Scenes series...,Mentally ill inmates in Miami are housed on th...,ee8871b15c50d0db17b0179a6d2beab35065f1e9
2,"MINNEAPOLIS, Minnesota (CNN) -- Drivers who we...","NEW: ""I thought I was going to die,"" driver sa...",06352019a19ae31e527f37f7571c6dd7f0c5da37
3,WASHINGTON (CNN) -- Doctors removed five small...,"Five small polyps found during procedure; ""non...",24521a2abb2e1f5e34e6824e0f9e56904a2b0e88
4,(CNN) -- The National Football League has ind...,"NEW: NFL chief, Atlanta Falcons owner critical...",7fe70cc8b12fab2d0a258fababf7d9c6b5e1262a
...,...,...,...
287108,"The nine-year-old daughter of a black, unarmed...","Rumain Brisbon, 34, was killed after Phoenix p...",279a12d3ee37b8109cc192a9e88115a5a631fb06
287109,Legalising assisted suicide is a slippery slop...,"Theo Boer, a European assisted suicide watchdo...",b5bc9d404a9a5d890c9fc26550b67e6d8d83241f
287110,A group calling itself 'The Women of the 99 Pe...,Ohio congressman criticised for 'condoning the...,500862586f925e406f8b662934e1a71bbee32463
287111,Most men enjoy a good pint of lager or real al...,The Black Country Ale Tairsters have been to 1...,32a1f9e5c37a938c0c0bca1a1559247b9c4334b2


In [41]:
df_cnn.describe()

Unnamed: 0,doc_len,sum_len
count,287113.0,287113.0
mean,691.870326,51.574101
std,336.500292,21.256336
min,8.0,4.0
25%,443.0,38.0
50%,632.0,48.0
75%,877.0,60.0
max,2347.0,1296.0


In [44]:
xsum_datasets = load_dataset(
            "xsum",
#            "3.0.0"
#            cache_dir="G:\.cache\huggingface\datasets"
        )
df_xsum = xsum_datasets["train"].to_pandas()
df_xsum["doc_len"] = df_xsum["document"].map(lambda x: len(x.split()))
df_xsum["sum_len"] = df_xsum["summary"].map(lambda x: len(x.split()))

In [46]:
df_xsum.describe()

Unnamed: 0,doc_len,sum_len
count,204045.0,204045.0
mean,373.864633,21.097645
std,304.632089,5.236819
min,0.0,1.0
25%,176.0,18.0
50%,295.0,21.0
75%,491.0,24.0
max,29189.0,70.0


In [3]:
wiki_datasets = load_dataset(
    "wikipedia",
    "20220301.en",
     cache_dir="G:\.cache\huggingface\datasets"
)

In [4]:
train_dataset = wiki_datasets["train"]

In [6]:
def keepBetween(x, mini, maxi):
    length = len(x.split())
    return length < maxi and length > mini

In [11]:
dataset_100_1000 = train_dataset.filter(lambda x: keepBetween(x["text"], 100, 1000))

Filter:   0%|          | 0/6458670 [00:00<?, ? examples/s]

In [14]:
dataset_new = wiki_datasets

In [15]:
dataset_new["train"] = dataset_100_1000

In [17]:
dataset_new.save_to_disk("G:\.cache\huggingface\datasets\wiki_100_1000")

Saving the dataset (0/24 shards):   0%|          | 0/3767787 [00:00<?, ? examples/s]

In [23]:
dataset_new["train"]

Dataset({
    features: ['id', 'url', 'title', 'text'],
    num_rows: 3767787
})

In [5]:
wiki_datasets = load_dataset(
    "wikihow",
    "all",
     cache_dir="G:\.cache\huggingface\datasets"
)

ManualDownloadError:                   The dataset wikihow with config all requires manual data.
                  Please follow the manual download instructions:
                     You need to manually download one of the wikihow files. An overview of which files to download can be seen at https://github.com/mahnazkoupaee/WikiHow-Dataset.
You need to download one the following two data files manually, depending on the version you want:
  1) all: https://ucsb.app.box.com/s/ap23l8gafpezf4tq3wapr6u8241zz358 and save the file under <path/to/folder>/wikihowAll.csv
  2) sep: https://ucsb.app.box.com/s/7yq601ijl1lzvlfu4rjdbbxforzd2oag and save the file under <path/to/folder>/wikihowSep.csv

The <path/to/folder> can e.g. be "~/manual_wikihow_data".

Wikihow can then be loaded for example using the following command `datasets.load_dataset("wikihow", "all", data_dir="<path/to/folder>")`.

                  Manual data can be loaded with:
                   datasets.load_dataset("wikihow", data_dir="<path/to/manual/data>")

In [6]:
from summarizer import Summarizer

body = 'Text body that you want to summarize with BERT'
body2 = 'Something else you want to summarize with BERT'
model = Summarizer()
model(body)
model(body2)

Downloading (…)lve/main/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

'Something else you want to summarize with BERT'

In [8]:
model("some long text")

''

In [10]:
wiki_datasets = load_dataset(
    "wikipedia",
    "20220301.en",
     cache_dir="G:\.cache\huggingface\datasets"
)

In [12]:
text = wiki_datasets["train"]["text"][0]

In [21]:
model(text, num_sentences=3)

  super()._check_params_vs_input(X, default_n_init=10)


'Anarchism is a political philosophy and movement that is sceptical of authority and rejects all involuntary, coercive forms of hierarchy. Aeschylus and Sophocles used the myth of Antigone to illustrate the conflict between rules set by the state and personal autonomy. Philosophy lecturer Andrew G. Fiala composed a list of common arguments against anarchism which includes critiques such as that anarchism is innately related to violence and destruction, not only in the pragmatic world, such as at protests, but in the world of ethics as well.'

In [36]:
def mapdataset():
    from summarizer import Summarizer

    body = 'Text body that you want to summarize with BERT'
    body2 = 'Something else you want to summarize with BERT'
    model = Summarizer()
    def extract_summaries(examples):
        summaries = []
        for text in examples["text"]:
            summary = model(text, num_sentences=3)
            summaries.append(summary)

        examples["summaries"] = summaries
        return examples
    
    res = wiki_datasets["train"].select(range(100)).map(
        extract_summaries,
        batched=True,
        num_proc=1,
        batch_size=100
)

In [37]:
mapdataset()

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)
  super()._check_params_vs_input(X, default_n_init=10)


  super()._check_params_vs_input(X, default_n_init=10)


KeyboardInterrupt: 