In [1]:
from nlpkf.data_processing import DataProcessor
import numpy as np

In [2]:
# Toy corpus: four short text fragments, kept in their original order.
corpus = [
    "The elephant sneezed at the sight of potatoes. ",
    "Bats can see via echolocation. See the bat sight sneeze!",
    "Wondering, she opened the door to the studio.",
    " MOUNTAINS permitors.",
]

# Glue the fragments together as-is; no separator is inserted between them.
text = "".join(fragment for fragment in corpus)

In [3]:
# List the file ids of the Gutenberg sample corpus exposed by NLTK.
# Written without interactive ">>>" prompts so the cell is plain, valid Python.
import nltk
from nltk.corpus import gutenberg

gutenberg.fileids()

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']

In [4]:
# Tokenizer options; passed to DataProcessor through `tokenizer_kwargs`.
tok_kwargs = {
    "remove_stopwords": True,
    "use_stems": False,
    "to_lowercase": True,
    "use_lemma": True,
    "remove_punctuation": False,
}

In [5]:
# Build the DataProcessor and report how long construction takes (in seconds).
import time

# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed intervals; time.time() can jump if the system clock is adjusted.
start = time.perf_counter()
w2v = DataProcessor(
    tokenizer_kwargs=tok_kwargs,
    vectorizer_kwargs={"strip_accents": "unicode"},
)
end = time.perf_counter()
print(end - start)

10.041623592376709


In [6]:
# Read the raw text of the 'austen-emma.txt' Gutenberg file and split it on
# newlines, so `emma` is a list with one string per line of the file.
emma = gutenberg.raw('austen-emma.txt').split("\n")
# The two steps below are currently disabled: cleaning each line with the
# data processor, then dropping lines of 3 characters or fewer.
#emma = [w2v.clean_text(s) for s in emma]

#emma = [s for s in emma if len(s) > 3]

In [7]:
# Build the vocabulary from the toy corpus, then map the tokenized result to
# integer indexes.
clean_corpus = w2v.build_vocabulary(corpus, clean_corpus=True)
indexes = w2v.tokens_to_index(w2v.tokenize_corpus(clean_corpus))

# Turn every sentence into 3-grams and stack all of them into one 2-D array.
tokens = w2v.to_ngrams(indexes, 3)
dataset = np.array(
    [list(seq) for sentence in tokens for seq in sentence],
    dtype=np.int64,
)

# Every column but the last is the input; the last column is the target.
X = dataset[:, :-1]
y = dataset[:, -1]

THE False
ELEPHANT False
SNEEZED False
AT False
THE False
SIGHT False
OF False
POTATOES False
. False
BATS False
CAN False
SEE False
VIA False
ECHOLOCATION False
. False
SEE False
THE False
BAT False
SIGHT False
SNEEZE False
! False
WONDERING False
, False
SHE False
OPENED False
THE False
DOOR False
TO False
THE False
STUDIO False
. False
$ False
MOUNTAINS False
PERMITORS False
. False
ELEPHANT False
SNEEZE False
SIGHT False
POTATO False
. False
BAT False
ECHOLOCATION False
. False
BAT False
SIGHT False
SNEEZE False
! False
WONDER False
, False
OPEN False
DOOR False
STUDIO False
. False
$$ False
MOUNTAIN False
PERMITOR False
. False
ELEPHANT False
SNEEZE False
SIGHT False
POTATO False
. False
BAT False
ECHOLOCATION False
. False
BAT False
SIGHT False
SNEEZE False
! False
WONDER False
, False
OPEN False
DOOR False
STUDIO False
. False
$$ False
MOUNTAIN False
PERMITOR False
. False


In [8]:
from nlpkf.models.ngram import NgramModel

# NOTE(review): the positional `2` looks like the context size (the prediction
# listings further down show two-word contexts) — confirm against NgramModel.
ngram = NgramModel(
    w2v,
    2,
    load_embedding=True,
    embedding_dim=300,
    hidden_size=256,
)


In [58]:
# Fit the model on all lines of `emma` for 10 epochs; `emma[:]` passes a
# shallow copy of the list. The cell's value (shown below) is whatever
# `fit` returns.
ngram.fit(emma[:], n_epochs=10, clean_corpus=True)

100%|██████████| 10/10 [00:51<00:00,  4.86s/it]


(array([[ 13, 265],
        [265, 131],
        [131, 449],
        ...,
        [  8, 380],
        [380, 663],
        [663, 810]]),
 array([131, 449,  81, ..., 663, 810, 532]),
 [25277.56155371666,
  22502.80252456665,
  21515.356505393982,
  21056.035797595978,
  20718.793347597122,
  20440.897978305817,
  20192.052221298218,
  19956.855314016342,
  19729.494696855545,
  19507.583028316498])

In [59]:
# Take a single line of the text and run the model on it, also asking for the
# dataset that the predictions were made from.
# NOTE: this rebinds `text`, which earlier held the joined toy corpus.
text = emma[25]
preds, data = ngram.predict(text, return_dataset=True)
text

'her to impose any restraint; and the shadow of authority being'

In [60]:
def evaluate_pred(preds, data, text, ngram):
    """Print `text` followed by one "in: <context> | <prediction>" line per sample.

    Args:
        preds: Predictions, converted to words with ``ngram.array_to_words``.
        data: Context rows, converted to words with ``ngram.array_to_words``;
            each converted row is joined with single spaces.
        text: The source text, printed as the first line of the report.
        ngram: Object providing ``array_to_words``.

    Returns:
        None. The report is written to standard output with ``print``.
    """
    predicted_words = ngram.array_to_words(preds)
    context_words = ngram.array_to_words(data)

    # Header line first, then one line per (context, prediction) pair; zip()
    # stops at the shorter of the two sequences.
    report_lines = ["{}\n".format(text)]
    report_lines.extend(
        "in: {} | {}\n".format(" ".join(context), word)
        for context, word in zip(context_words, predicted_words)
    )
    print("".join(report_lines))


In [61]:
# Print each context next to the model's prediction for the sampled line.
evaluate_pred(preds, data, text, ngram)

her to impose any restraint; and the shadow of authority being
in: her to | the
in: to impose | ,
in: impose any | to
in: any restraint | ,
in: restraint ; | and
in: ; and | the
in: and the | the
in: the shadow | of
in: shadow of | the
in: of authority | ,
in: authority being | the



In [44]:
# NOTE(review): presumably registers the token "pas" in the data processor's
# vocabulary — add_to_vocab is defined outside this notebook; confirm.
ngram.dataproc.add_to_vocab("pas")

In [None]:
# Display the first 100 lines of the text.
emma[:100]

In [8]:
# Run the model on the first toy sentence and show, side by side: the raw
# sentence, the predictions as words, and the input dataset as words.
# NOTE: this rebinds `dataset`, which earlier held the 3-gram index array.
pred, dataset = ngram.predict(corpus[0], return_dataset=True)
corpus[0], ngram.array_to_words(pred), ngram.array_to_words(dataset)

('The elephant sneezed at the sight of potatoes. ',
 array(['sight', 'potato', '.', '.'], dtype='<U6'),
 array([['elephant', 'sneezed'],
        ['sneezed', 'sight'],
        ['sight', 'potato'],
        ['potato', '.']], dtype='<U8'))

In [26]:
# Manual, step-by-step prediction for the first toy sentence: clean and
# tokenize it, map tokens to indexes, build n-grams, then query the model one
# row at a time.
import torch  # needed for torch.tensor below; imported here so the cell runs top-to-bottom

# NOTE(review): `self` is just an alias for the model, presumably so the lines
# below can be copied to/from a method body unchanged.
self = ngram
tokens = self.dataproc.clean_text(corpus[0], return_tokens=True)
indexes = self.dataproc.tokens_to_index([tokens])
ngram_ix = self.dataproc.to_ngrams(indexes, self.context_size)
dataset = np.array([list(seq) for sentence in ngram_ix for seq in sentence],
                   dtype=np.int64)
# One forward pass per row; argmax(1) picks the highest-scoring index and
# .item() converts it to a plain Python int.
preds = [self.model(torch.tensor([x], dtype=torch.long)).argmax(1).item() for x in dataset]


In [38]:
# NOTE(review): the right-hand side of this assignment was missing (a syntax
# error). `ngram.array_to_words` is assumed because the outputs of
# `func(dataset)` and `func(preds)` below are word arrays of the same form
# that `ngram.array_to_words` produces elsewhere in this notebook — confirm.
func = ngram.array_to_words

In [39]:
# Display the result of applying `func` to the index dataset.
func(dataset)

array([['elephant', 'sneezed', 'sight'],
       ['sneezed', 'sight', 'potato'],
       ['sight', 'potato', '.']], dtype='<U8')

In [40]:
# Display the result of applying `func` to the predicted indexes.
func(preds)

array(['potato', 'bat', 'bat'], dtype='<U6')

In [21]:
# Inspect the data processor's vocabulary (`self` is the alias for `ngram`
# set in an earlier cell).
self.dataproc.vocabulary

{'elephant': 6,
 'sneezed': 14,
 'sight': 12,
 'potato': 10,
 '.': 2,
 'bat': 3,
 'see': 11,
 'via': 16,
 'echolocation': 5,
 'sneeze': 13,
 '!': 0,
 'wondering': 17,
 ',': 1,
 'opened': 8,
 'door': 4,
 'studio': 15,
 'mountain': 7,
 'permitors': 9}

In [43]:
# Interactive help lookup: the trailing `?` is IPython syntax that shows the
# docstring of torch.nn.NLLLoss. Development aid only; not valid plain Python.
import torch
torch.nn.NLLLoss?

In [59]:
# Interactive help lookup: the trailing `?` is IPython syntax that shows the
# docstring of nltk.ngrams. Development aid only; not valid plain Python.
import nltk
nltk.ngrams?