In [1]:
import numpy
import nltk
import spacy
import math
import random
nlp = spacy.load("en_core_web_sm")

### Preparing the text

In [2]:
text = open('parsed_text.txt').read()
SOS = '<SOS>'
EOS = '<EOS>'
UNK = '<UNK>'
n = 2                                             

In [3]:
def add_special_tokens(sentences, n):
    return [f"{(SOS + ' ') * numpy.clip(n-1, 1, 1000)}{sentence} {EOS}" for sentence in sentences]

In [4]:
def create_unknown_tokens(tokens):
    freqs = nltk.FreqDist(tokens)
    return [token if freqs[token] > 1 else UNK for token in tokens]

In [5]:
def preprocess_text(text, n):
    doc = nlp(text)
    sentences = [sentence.text.lower() for sentence in doc.sents]
    sentences = add_special_tokens(sentences, n)
    tokens = ' '.join(sentences).split(' ')
    tokens = create_unknown_tokens(tokens)
    return sentences, tokens

In [6]:
sentences, tokens = preprocess_text(text, n)

In [7]:
len(tokens)

3049

### N Grams

In [8]:
vocab = nltk.FreqDist(tokens)
vocab_size = len(vocab)

In [9]:
n_grams = nltk.ngrams(tokens, n)
n_vocab = nltk.FreqDist(n_grams)

In [10]:
m_grams = nltk.ngrams(tokens, n-1)
m_vocab = nltk.FreqDist(m_grams)

In [11]:
def smoothed_count(n_gram, n_count, k, m_vocab, vocab_size):
    m_gram = n_gram[:-1]
    m_count = m_vocab[m_gram]
    return -math.log((n_count + k) / (m_count + k * vocab_size))

In [12]:
probabilities = {n_gram: smoothed_count(n_gram, count, 1, m_vocab, vocab_size) for n_gram, count in n_vocab.items()}

In [13]:
len(probabilities)

406

### Generation

In [14]:
def choose_next_word(probabilities, m_gram, blacklist):
    candidates = {}
    blacklist = blacklist + [UNK]

    for n_gram, probability in probabilities.items():
        if n_gram[:-1] == m_gram:                                                                  # find n_grams based on m_gram
            candidate = n_gram[-1]
            if candidate not in blacklist:
                candidates[candidate] = probability                                                # check that new word is not in blacklist

    candidates = sorted(candidates.items(), key=lambda probability: probability[1], reverse=True)  # sort by probability

    if len(candidates) == 0:
        return (EOS, 1)                                                                            # if no other options EOS
    else:
        return random.choice(candidates[:min(len(candidates), 15)])

In [16]:
def generate_sentences(probabilities, n, n_sent, max_len = 25):
    for _ in range(n_sent):
        sentence = [SOS] * numpy.clip(n-1, 1, 1000)                                                 # create new sentenses with SOS
        prob = 1

        while sentence[-1] != EOS:
            prev_words = () if n == 1 else tuple(sentence[-(n-1):])                                 # find m_gram
            blacklist = sentence + [EOS]                                                            # create blacklist so all words are unique
            
            next_word, next_prob = choose_next_word(probabilities, prev_words, blacklist)
            sentence.append(next_word)
            prob += next_prob

            if len(sentence) >= max_len:
                sentence.append(EOS)

        print(' '.join(sentence), math.exp(-prob))


In [23]:
generate_sentences(probabilities, n, 5)

<SOS> data may <EOS> 2.905125753710693e-05
<SOS> work under the book on learning for artificial neurons used by theoretical and study in cognitive systems had come to pattern recognition continued outside <EOS> 6.961145935354397e-44
<SOS> other researchers were mostly with algorithms studied human cognitive systems had been <EOS> 1.1783461654611023e-22
<SOS> from data and computer program is to pattern recognition continued outside the mathematical models of human cognitive systems had come <EOS> 1.1867802031612743e-36
<SOS> by mathematical models which have been <EOS> 6.62916277420036e-12
