In [19]:
import os

from argparse import Namespace
import collections
import nltk.data
import numpy as np
import pandas as pd
import re
import string
import nltk
from tqdm.notebook import tqdm

In [8]:
#>>> nltk.download()

In [9]:
args = Namespace(
    raw_dataset_txt="../data/books/frankenstein/frankenstein.txt",
    window_size=5,
    train_prop=0.7,
    val_prop=0.15,
    test_prop=0.15,
    output_munged_csv="../data/books/frankenstein/frankenstein_with_splits.csv",
    seed=1337
)

In [10]:
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
with open(args.raw_dataset_txt) as fp:
    book = fp.read()
sentences = tokenizer.tokenize(book)

In [11]:
print(len(sentences))
print("Sample:", sentences[100])

3427
Sample: No incidents have hitherto befallen us that would make a figure in a
letter.


In [12]:
def preprocess_text(text):
    text = ' '.join(word.lower() for word in text.split(" "))
    text = re.sub(r"([.,!?])", r" \1 ", text)
    text = re.sub(r"[^a-zA-Z.,!?]+", r" ", text)
    return text

In [13]:
cleaned_sentences = [preprocess_text(sentence) for sentence in sentences]

In [18]:
cleaned_sentences[:4]

['frankenstein , or the modern prometheus by mary wollstonecraft godwin shelley letter st . petersburgh , dec . th , to mrs . saville , england you will rejoice to hear that no disaster has accompanied the commencement of an enterprise which you have regarded with such evil forebodings . ',
 'i arrived here yesterday , and my first task is to assure my dear sister of my welfare and increasing confidence in the success of my undertaking . ',
 'i am already far north of london , and as i walk in the streets of petersburgh , i feel a cold northern breeze play upon my cheeks , which braces my nerves and fills me with delight . ',
 'do you understand this feeling ? ']

In [15]:
MASK_TOKEN = "<MASK>"

In [21]:
# create windows
flatten = lambda outer_list: [item for inner_list in outer_list for item in inner_list]
windows = flatten([list(nltk.ngrams([MASK_TOKEN] * args.window_size + sentence.split(' ') + 
                                   [MASK_TOKEN] * args.window_size, args.window_size * 2 + 1
                                   )) 
                  for sentence in tqdm(cleaned_sentences)
                  ])

  0%|          | 0/3427 [00:00<?, ?it/s]

In [24]:
data = []
for window in tqdm(windows):
    target_token = window[args.window_size]
    context = []
    for i, token in enumerate(window):
        if token != MASK_TOKEN and i != args.window_size:
            context.append(token)
    data.append([' '.join(token for token in context), target_token])
cbow_data = pd.DataFrame(data, columns=['context', 'target'])

  0%|          | 0/90698 [00:00<?, ?it/s]

In [26]:
cbow_data.head(3)

Unnamed: 0,context,target
0,", or the modern prometheus",frankenstein
1,frankenstein or the modern prometheus by,","
2,"frankenstein , the modern prometheus by mary",or
