In [1]:
import numpy as np
from random import choices
from random import randrange


In [2]:
def read_and_process_text(filename):
    """Read a text file and split it into a character vocabulary and sentences.

    Processing steps, in order:
      1. Read the whole file and lower-case it.
      2. Replace quotes, brackets, digits and a few other symbols with spaces,
         and turn '?' / '!' into '.' so every sentence ends the same way.
      3. Collapse every run of whitespace into a single space.
      4. Split on '.' and drop empty pieces.

    Args:
        filename: Path of the text file to read (opened with the platform's
            default encoding, as ``open(filename)`` does).

    Returns:
        A ``(letters, sentences)`` tuple where ``letters`` is the sorted list
        of distinct characters in the normalized text and ``sentences`` is a
        list of the non-empty pieces between periods, each with a trailing
        '.' re-attached. Pieces are not stripped, so a sentence may start or
        end with a single space.
    """
    # Use a context manager so the file handle is closed deterministically
    # instead of being left for the garbage collector.
    with open(filename) as text_file:
        raw_text = text_file.read().lower()

    # One translation pass: the two character sets are disjoint, so merging
    # them gives the same result as translating twice.
    translation_table = dict.fromkeys(map(ord, '\"\'()*-0123456789:;[]_`'), ' ')
    translation_table.update(dict.fromkeys(map(ord, '?!'), '.'))
    raw_text = raw_text.translate(translation_table)

    # split() with no argument splits on any whitespace run (incl. newlines).
    raw_text = " ".join(raw_text.split())

    letters = sorted(set(raw_text))
    sentences = [piece + "." for piece in raw_text.split(".") if piece]
    return letters, sentences

In [3]:

def markov_chain(seq_length, list_sentences, letters):
    """Build a character-level transition table from a list of sentences.

    Every window of ``seq_length`` consecutive characters inside a sentence is
    a context; the character right after the window is its observed successor.
    Windows never span two sentences.

    Args:
        seq_length: Number of characters in each context.
        list_sentences: Iterable of sentence strings to learn from.
        letters: The vocabulary; every successor character must be in it
            (otherwise a ``KeyError`` is raised while counting).

    Returns:
        A dict mapping each observed context string to a dict
        ``{letter: probability}`` covering every entry of ``letters``; the
        probabilities of one context sum to 1.
    """
    # Pass 1: raw successor counts per context.
    counts = {}
    for sentence in list_sentences:
        window_count = len(sentence) - seq_length
        for start in range(window_count):
            stop = start + seq_length
            context, successor = sentence[start:stop], sentence[stop]
            if context not in counts:
                counts[context] = dict.fromkeys(letters, 0)
            counts[context][successor] += 1

    # Pass 2: normalize each row of counts into probabilities. Every stored
    # context has at least one observation, so the total is never zero.
    probabilities = {}
    for context, row in counts.items():
        total = sum(row.values())
        probabilities[context] = {ch: hits / total for ch, hits in row.items()}
    return probabilities


In [4]:
def markov_gen(string, table, seq_length, letters, max_len=1000):
    """Extend ``string`` by sampling characters from a Markov transition table.

    At every step the last ``seq_length`` characters form the context. If the
    context is a key of ``table``, the next character is drawn from ``letters``
    with the probabilities stored for that context; otherwise it is drawn
    uniformly from ``letters``.

    Args:
        string: Seed text; must contain at least ``seq_length`` characters.
        table: Mapping ``context -> {letter: probability}``; each row must hold
            a probability for every entry of ``letters``.
        seq_length: Context length in characters.
        letters: Vocabulary to sample from.
        max_len: Generation stops once the text is longer than this, so the
            result can be at most ``max_len + 1`` characters (or one character
            longer than a seed that already exceeds ``max_len``).

    Returns:
        The seed followed by the generated characters. Generation ends right
        after a '.' is produced or when the length limit is exceeded. If the
        seed is shorter than ``seq_length`` a message is printed and the seed
        is returned unchanged.
    """
    # A seed of exactly seq_length characters is a complete context, so only
    # strictly shorter seeds are rejected.
    if seq_length > len(string):
        print("Cannot predict(too short init string)")
        return string
    while True:
        # Slice from an explicit start index: string[-0:] would return the
        # whole string instead of the empty context when seq_length is 0.
        context = string[len(string) - seq_length:]
        row = table.get(context)
        if row is None:
            # Unseen context: fall back to a uniformly random character.
            picked = np.random.choice(letters)
        else:
            picked = np.random.choice(letters, p=[row[ch] for ch in letters])
        # np.random.choice returns a numpy string scalar; keep `string` a str.
        next_char = str(picked)
        string += next_char
        if next_char == '.' or len(string) > max_len:
            return string


In [5]:
def markov_gen_arg_max(string, table, seq_length, letters, max_len=1000):
    """Extend ``string`` greedily using the most probable next character.

    At every step the last ``seq_length`` characters form the context. If the
    context is a key of ``table``, the entry of ``letters`` with the highest
    stored probability is appended (the first one in ``letters`` order wins a
    tie); otherwise a uniformly random entry of ``letters`` is appended.

    Args:
        string: Seed text; must contain at least ``seq_length`` characters.
        table: Mapping ``context -> {letter: probability}``; each row must hold
            a probability for every entry of ``letters``.
        seq_length: Context length in characters.
        letters: Vocabulary to choose from.
        max_len: Generation stops once the text is longer than this, so the
            result can be at most ``max_len + 1`` characters (or one character
            longer than a seed that already exceeds ``max_len``).

    Returns:
        The seed followed by the generated characters. Generation ends right
        after a '.' is produced or when the length limit is exceeded. If the
        seed is shorter than ``seq_length`` a message is printed and the seed
        is returned unchanged.
    """
    # A seed of exactly seq_length characters is a complete context, so only
    # strictly shorter seeds are rejected.
    if seq_length > len(string):
        print("Cannot predict(too short init string)")
        return string
    while True:
        # Slice from an explicit start index: string[-0:] would return the
        # whole string instead of the empty context when seq_length is 0.
        context = string[len(string) - seq_length:]
        row = table.get(context)
        if row is None:
            # Unseen context: fall back to a uniformly random character.
            # str() keeps `string` a plain str rather than a numpy scalar.
            next_char = str(np.random.choice(letters))
        else:
            best_index = np.argmax([row[ch] for ch in letters])
            next_char = letters[best_index]
        string += next_char
        if next_char == '.' or len(string) > max_len:
            return string


In [6]:
# Load the corpus: `letters` is the sorted character vocabulary and
# `list_sentences` the '.'-terminated sentences from read_and_process_text.
# The path is relative to the notebook's working directory.
letters, list_sentences = read_and_process_text("data/alice_in_wonderland.txt")
# Report the corpus size as a quick sanity check.
print("Total Sentences: ", len(list_sentences))
print("Total Vocab: ", len(letters))

Total Sentences:  1630
Total Vocab:  29


In [7]:
# Show the full character vocabulary extracted from the text.
print(letters)

[' ', ',', '.', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [8]:
# Context length in characters: the number of preceding characters the
# Markov chain conditions on when predicting the next one.
seq_length = 15


In [9]:
# Build the transition table: every seq_length-character context seen in the
# corpus maps to a probability for each character in `letters`.
table = markov_chain(seq_length, list_sentences, letters)

In [17]:
# Seed text for generation; `string` is also reused by the greedy-decoding
# cell that follows.
string = "she was close behind it when she turned the corner,"
# Sampled continuation. No random seed is set in the visible cells, so the
# output can differ between runs.
markov_gen_string = markov_gen(string, table, seq_length, letters)
print(markov_gen_string)
# NOTE(review): the line below looks like a sample continuation kept for
# reference — confirm where it comes from.
#  trying every door, she walked sadly down the middle, wondering how she was ever to get out again.


she was close behind it when she turned the corner, but the rabbit was no longer to be seen she found herself in a long, low hall, which was lit up by a row of lamps hanging from the roof.


In [15]:
# Greedy (arg-max) continuation of the same seed; depends on `string` from
# the sampling cell above having been run first.
markov_gen_string = markov_gen_arg_max(string, table, seq_length, letters)
print(markov_gen_string)


she was close behind it when she turned the corner, but the rabbit was no longer to be seen she found herself in a game of croquet she was playing against herself, for this curious child was very fond of pretending to be two people.


In [19]:
# Same seed but ending in a space instead of a comma. Judging by the recorded
# output, the resulting context is not in `table`, so the generator falls back
# to uniformly random characters until it happens to emit a '.'.
bad_string = "she was close behind it when she turned the corner "
markov_gen_string = markov_gen_arg_max(bad_string, table, seq_length, letters)
print(markov_gen_string)


she was close behind it when she turned the corner uelargxdsy.
