In [None]:
!pip install blingfire

In [None]:
import os
import time
import re
import gzip
import gc
from tqdm.auto import tqdm
from concurrent.futures import ProcessPoolExecutor, as_completed
import blingfire as bf
import numpy as np
from datasets import load_dataset

In [None]:
# Wiki-markup patterns, compiled once so every article reuses them.
# re.DOTALL only changes how `.` matches, so among these patterns it affects
# references_pattern alone (the only one that contains a `.`).

# Template blocks whose name starts with "Infobox " / "Sidebar ".
infobox_pattern = re.compile(r'\{\{Infobox [^}]+\}\}', re.DOTALL)
sidebar_pattern = re.compile(r'\{\{Sidebar [^}]+\}\}', re.DOTALL)

# [[target]] or [[target|label]]; group 2 holds the text that stays visible.
link_pattern = re.compile(r'\[\[([^|\]]+\|)?([^\]]+)\]\]')

# A trailing-section heading together with everything that follows it.
references_pattern = re.compile(
    r'==\s*(References|External links|See also|Notes)\s*==.*',
    re.DOTALL,
)

# Citation-needed markers, long form and the {{cn}} short form.
citation_needed_pattern = re.compile(r'\{\{citation needed[^}]*\}\}', re.DOTALL)
cn_pattern = re.compile(r'\{\{cn\}\}', re.DOTALL)

# Catch-all for any other {{...}} template.
curly_braces_pattern = re.compile(r'\{\{[^}]+\}\}', re.DOTALL)

# Any run of whitespace (spaces, tabs, newlines).
whitespace_pattern = re.compile(r'\s+')

In [None]:
def preprocess_article(text: str) -> str:
    """Strip wiki markup from ``text`` and normalise its whitespace.

    The module-level patterns are applied one after another in a fixed order:
    infobox and sidebar templates are removed, links are replaced by their
    visible text, the references-style trailing section is dropped, citation
    markers and any remaining ``{{...}}`` templates are removed, and finally
    every whitespace run is collapsed to a single space.

    Returns the cleaned text with leading/trailing whitespace stripped.
    """
    substitutions = (
        (infobox_pattern, ''),
        (sidebar_pattern, ''),
        (link_pattern, r'\2'),           # keep only the link's visible text
        (references_pattern, ''),
        (citation_needed_pattern, ''),
        (cn_pattern, ''),                # short form of citation needed
        (curly_braces_pattern, ''),
        (whitespace_pattern, ' '),
    )
    for pattern, replacement in substitutions:
        text = pattern.sub(replacement, text)
    return text.strip()

def process_article(article_text: str, min_len: int, max_len: int) -> str:
    """Clean an article and return its length-filtered sentences.

    The text is first passed through ``preprocess_article``; if nothing is
    left, an empty string is returned. Otherwise blingfire supplies one
    (start, end) offset pair per sentence, and a sentence is kept only when
    ``end - start`` lies within ``[min_len, max_len]`` (both inclusive).

    Returns the kept sentences joined by newlines, in their original order.
    """
    cleaned = preprocess_article(article_text)
    if not cleaned:
        return ""

    # Only the offsets are used; each sentence is sliced out of the cleaned text.
    _, spans = bf.text_to_sentences_and_offsets(cleaned)
    kept = [
        cleaned[start:end]
        for start, end in spans
        if min_len <= end - start <= max_len
    ]
    return '\n'.join(kept)

def process_article_wrapper(args):
    """Call ``process_article`` with a packed argument sequence.

    ``args`` is unpacked positionally, so it must hold
    ``(article_text, min_len, max_len)`` in that order. Returns whatever
    ``process_article`` returns.
    """
    return process_article(*args)

def process_wikipedia_dataset(wiki_dataset, output_dir, articles_per_file=1_000_000, batch_size=100,
                              min_len=32, max_len=2048):
    """Process every article in parallel and write the results to gzip shards.

    Each article's ``'text'`` field is handed to ``process_article_wrapper``
    in a process pool. Every result is written to the current shard followed
    by a newline, so an article that yields no sentences contributes an empty
    line. Results are written in completion order, which is not necessarily
    the dataset order.

    Args:
        wiki_dataset: Iterable of mappings that each provide a ``'text'`` entry.
        output_dir: Directory for the shards; created if it does not exist.
            Shards are named ``wikipedia_processed_<n>.txt.gz``, numbered from 1.
        articles_per_file: Number of articles written to a shard before the
            next one is started.
        batch_size: Maximum number of futures kept in flight; once reached,
            one finished result is written for every newly submitted article.
        min_len: Minimum sentence length forwarded to the worker.
        max_len: Maximum sentence length forwarded to the worker.

    Raises:
        Any exception raised inside a worker is re-raised here when its result
        is collected; the open shard is closed before it propagates.
    """
    os.makedirs(output_dir, exist_ok=True)  # Ensure output directory exists

    def open_shard(index):
        return gzip.open(f'{output_dir}/wikipedia_processed_{index}.txt.gz', 'wt', encoding='utf-8')

    file_count = 1
    article_count = 0  # articles written to the current shard
    out_f = open_shard(file_count)  # the first shard always exists, even for an empty dataset

    def write_result(future):
        """Write one finished article, starting a new shard first if the current one is full."""
        nonlocal file_count, article_count, out_f
        sentences = future.result()  # re-raises any exception from the worker
        # Rotate right before a write rather than right after one, so a run
        # whose size is an exact multiple of articles_per_file does not leave
        # an empty trailing shard. The `> 0` guard keeps at least one article
        # per shard even for a non-positive articles_per_file.
        if article_count >= articles_per_file and article_count > 0:
            out_f.close()
            file_count += 1
            article_count = 0
            out_f = open_shard(file_count)
        out_f.write(sentences + '\n')
        article_count += 1

    try:
        with ProcessPoolExecutor() as executor:
            # Only the futures are tracked; holding the article text alongside
            # them would keep it alive in the parent for no benefit.
            pending = set()
            for article in tqdm(wiki_dataset, desc='Processing Articles'):
                pending.add(executor.submit(process_article_wrapper, (article['text'], min_len, max_len)))

                if len(pending) >= batch_size:
                    # Back-pressure: wait for one result before submitting more.
                    finished = next(as_completed(pending))
                    pending.discard(finished)
                    write_result(finished)

            # Drain whatever is still in flight once the dataset is exhausted.
            for finished in as_completed(pending):
                write_result(finished)
    finally:
        # Close (and flush) the current shard even when a worker or the
        # dataset iteration raised.
        out_f.close()

In [None]:
# Load the train split of the "20220301.en" Wikipedia configuration.
# The first run downloads and prepares the data (the log below reports about
# 38 GiB in total); subsequent calls reuse the cached copy.
wiki_dataset = load_dataset("wikipedia", "20220301.en", split='train')
print(f'Length of the Wikipedia dataset is {len(wiki_dataset):_} articles.')
# Directory that receives the compressed output shards.
output_dir = '/kaggle/working'
# Run the full pipeline with the function's default shard and batch sizes.
process_wikipedia_dataset(wiki_dataset, output_dir)

Downloading builder script:   0%|          | 0.00/11.6k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/7.14k [00:00<?, ?B/s]

Downloading and preparing dataset wikipedia/20220301.en (download: 19.18 GiB, generated: 18.88 GiB, post-processed: Unknown size, total: 38.07 GiB) to /root/.cache/huggingface/datasets/wikipedia/20220301.en/2.0.0/aa542ed919df55cc5d3347f42dd4521d05ca68751f50dbc32bae2a7f1e167559...


Downloading:   0%|          | 0.00/15.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/20.3G [00:00<?, ?B/s]

Dataset wikipedia downloaded and prepared to /root/.cache/huggingface/datasets/wikipedia/20220301.en/2.0.0/aa542ed919df55cc5d3347f42dd4521d05ca68751f50dbc32bae2a7f1e167559. Subsequent calls will reuse this data.
Length of the Wikipedia dataset is 6_458_670 articles.


Processing Articles:   0%|          | 0/6458670 [00:00<?, ?it/s]