In [1]:
import requests
from argparse import Namespace
from pathlib import Path
from functools import partial
from typing import Iterator

# torch
from torchdata import datapipes as dp
from torchdata.datapipes import functional_datapipe
from torchdata.datapipes.iter import IterDataPipe

# spacy
import spacy
from spacy.language import Language
from spacy.lang.en import English
from spacy.tokens import Doc
# Module-level English pipeline object. Later cells register the "preprocess"
# and "sentencizer" components on it and pass it to the sentence datapipe.
nlp = English()

In [13]:
# Notebook-wide settings. `is_content` is mutable state: `filter_content`
# flips it while scanning the downloaded text for the start/end markers.
args = Namespace(
    # --- flags ---
    path_check=True,
    is_content=False,
    # --- source ---
    url="https://www.gutenberg.org/files/84/84-0.txt",
    # --- output directory and file name ---
    data_base_path="../data/cbow/",
    filename="frankenstein.txt",
    # --- markers delimiting the book body ---
    start_tkn="*** START OF THE PROJECT GUTENBERG EBOOK",
    end_tkn="*** END OF THE PROJECT GUTENBERG EBOOK ",
)

if args.path_check:
    # Every setting whose name ends in "path" is treated as a directory
    # that must exist before the later cells write into it.
    for option_name, option_value in vars(args).items():
        if option_name.endswith("path"):
            Path(option_value).mkdir(parents=True, exist_ok=True)

In [3]:
def para_joiner_fn(line):
    """Join the strings in `line` into one string separated by single spaces.

    Passed to `lines_to_paragraphs` as the function that merges the lines
    of one paragraph.
    """
    separator = " "
    return separator.join(line)

def drop_filename_fn(tuples):
    """Return the element at index 1 of `tuples`, discarding the rest.

    Mapped over the paragraph datapipe to keep only the text part of each
    item (presumably a (source name, paragraph) pair -- confirm against the
    upstream datapipe).
    """
    payload = tuples[1]
    return payload

def filter_content(args, para):
    """Stateful predicate: is `para` inside the start/end-marked region?

    The current state lives in `args.is_content` and is updated in place:
      * a paragraph containing `args.start_tkn` switches the state on
        (that paragraph itself is reported as content);
      * while on, a paragraph containing `args.end_tkn` switches it off
        (that paragraph itself is reported as not content).

    Returns the value of `args.is_content` after the update.
    """
    if args.start_tkn in para:
        args.is_content = True
        return True

    # The end marker only matters once the body has started.
    if args.is_content and args.end_tkn in para:
        args.is_content = False

    return args.is_content

In [4]:
# Lazy pipeline over the remote text:
#   HTTP reader -> decoded lines -> paragraphs -> text only -> marked region only
http_reader_dp = dp.iter.HttpReader([args.url])
line_reader_dp = http_reader_dp.readlines(decode=True, encoding="utf-8-sig")
para_dp = line_reader_dp.lines_to_paragraphs(para_joiner_fn)
drop_dp = para_dp.map(drop_filename_fn)

# `filter_content` keeps its on/off state on `args`, so bind `args` once here.
content_filter = partial(filter_content, args)
filter_dp = drop_dp.filter(content_filter)

In [5]:
# Export of the filtered paragraphs to `data_base_path + filename`, one
# paragraph per line. Currently disabled via the `if False` guard.
# NOTE(review): the file is opened with the platform default encoding.
if False:
    raw_text_path = args.data_base_path + args.filename
    with open(raw_text_path, "w") as out_file:
        out_file.writelines(paragraph + "\n" for paragraph in filter_dp)

In [2]:
@Language.component("preprocess")
def preprocess(doc):
    """Pipeline component that rebuilds `doc` from a cleaned token list.

    Keeps only tokens that are alphabetic or one of ``. ? ! ,``, lower-cases
    them, and returns a *new* ``Doc`` in which every kept token is followed
    by a single space.

    The new ``Doc`` is created on ``doc.vocab`` (the vocab of the incoming
    document) instead of the module-level ``nlp.vocab``, so the component no
    longer depends on a global and works in whichever pipeline it is added to.
    """
    kept_punct = {".", "?", "!", ","}
    words = [
        token.lower_
        for token in doc
        if token.is_alpha or token.text in kept_punct
    ]
    # Every token, including the last one, is marked as followed by a space.
    return Doc(vocab=doc.vocab, words=words, spaces=[True] * len(words))

In [3]:
# Register the components in order: "preprocess" returns a brand-new Doc, so
# it has to run before "sentencizer" marks sentence boundaries.
# `add_pipe` raises if a component with that name is already in the pipeline,
# so each call is guarded to keep this cell re-runnable on a live kernel.
for component_name in ("preprocess", "sentencizer"):
    if not nlp.has_pipe(component_name):
        nlp.add_pipe(component_name)

# Keep the cell's displayed result: the sentencizer component.
nlp.get_pipe("sentencizer")

<spacy.pipeline.sentencizer.Sentencizer at 0x19972b42380>

In [4]:
@functional_datapipe("get_sentence")
class SentencePipe(IterDataPipe):
    """Datapipe that turns paragraphs of text into sentence spans.

    Registered under the functional name ``get_sentence``.

    Args:
        datapipe: source datapipe yielding text to process.
        nlp: callable pipeline applied to each item; the result must expose
            a ``sents`` iterable.

    Yields:
        Each sentence of the processed text whose length is greater than 2.
    """

    def __init__(self,datapipe,nlp) -> None:
        super().__init__()
        self.datapipe = datapipe
        self.nlp = nlp

    def __iter__(self):
        for para in self.datapipe:
            # Use the pipeline given to the constructor, not the notebook
            # global of the same name, so the `nlp` argument is honoured.
            doc = self.nlp(para)
            for sent in doc.sents:
                # Skip sentences of two tokens or fewer.
                if len(sent) > 2:
                    yield sent

In [5]:
@functional_datapipe("get_context_target")
class ContextTargetPipe(IterDataPipe):
    """Datapipe producing (context, target) text pairs from sentences.

    Registered under the functional name ``get_context_target``.

    Args:
        datapipe: source datapipe yielding sliceable sentences whose slices
            expose ``text_with_ws`` and whose items expose ``text``.
        window_size: number of positions taken on each side of the target
            (fewer near the sentence edges).

    Yields:
        ``(context, target)`` where ``context`` is the text of the left
        window followed by the text of the right window, and ``target`` is
        the text of the token at the current position.
    """

    def __init__(self, datapipe, window_size=2) -> None:
        super().__init__()
        self.datapipe = datapipe
        self.window_size = window_size

    def __iter__(self):
        for sent in self.datapipe:
            for position, target in enumerate(sent):
                # Left window is clamped at the sentence start; the right
                # window is clamped by slicing itself.
                left = sent[max(0, position - self.window_size):position]
                right = sent[position + 1:position + self.window_size + 1]
                yield (left.text_with_ws + right.text_with_ws, target.text)

In [6]:
# Read the exported text back line by line. The path is built from the same
# `args` settings the export cell writes to, instead of repeating the literal
# "../data/cbow/frankenstein.txt", so the two locations cannot drift apart.
corpus_path = args.data_base_path + args.filename
opener = dp.iter.FileOpener([corpus_path])
reader = opener.readlines(return_path=False)

In [7]:
# Chain the sentence datapipe (functional name "get_sentence") onto the line
# reader, handing it the shared `nlp` pipeline.
sent_dp = reader.get_sentence(nlp)

In [8]:
# Sanity check: display the first sentence produced by the pipeline.
next(iter(sent_dp))

start of the project gutenberg ebook frankenstein

In [9]:
# Chain the (context, target) datapipe onto the sentences, using its default
# window size.
con_dp = sent_dp.get_context_target()

In [10]:
# Count the (context, target) pairs by exhausting the datapipe once.
sum(1 for pair in con_dp)

83244

In [11]:
# Seeded 70/15/15 split into train/test/val.
# NOTE(review): `total_length` is hard-coded to the pair count displayed by
# the counting cell above; it must be updated if the pipeline output changes.
split_weights = {"train": 0.7, "test": 0.15, "val": 0.15}
split = con_dp.random_split(
    weights=split_weights,
    seed=0,
    total_length=83244,
)

In [14]:
# Write each split to "<name>.csv" under `data_base_path`, one record per line
# in the form "<context>#<target>" (context stripped of outer whitespace).
# Names are paired positionally with the pipes in `split`.
# NOTE(review): files are opened with the platform default encoding.
split_names = ["train", "test", "val"]
for fname, pipe in zip(split_names, split):
    csv_path = args.data_base_path + f"{fname}.csv"
    with open(csv_path, "w") as out_file:
        out_file.writelines(f'{data[0].strip()}#{data[1]}\n' for data in pipe)

In [97]:
# Sanity check: display the first (context, target) pair of the first split.
next(iter(split[0]))

('start of project gutenberg ', 'the')