In [87]:
import re
import string
import collections
import unicodedata
import random
from IPython.core.display import display, HTML

In [2]:
sample_text = """Continuing this process, we obtain better and better approximations to the square root.
Now let's formalize the process in terms of procedures. We start with a value for the radicand (the
number whose square root we are trying to compute) and a value for the guess. If the guess is good
enough for our purposes, we are done; if not, we must repeat the process with an improved guess. We
write this basic strategy as a procedure:
(define (sqrt-iter guess x)
(if (good-enough? guess x)
guess
(sqrt-iter (improve guess x)
x)))
A guess is improved by averaging it with the quotient of the radicand and the old guess:
(define (improve guess x)
(average guess (/ x guess)))
where

(define (average x y)
(/ (+ x y) 2))
We also have to say what we mean by ''good enough.'' The following will do for illustration, but it is
not really a very good test. (See exercise 1.7.) The idea is to improve the answer until it is close
enough so that its square differs from the radicand by less than a predetermined tolerance (here
0.001): 22
(define (good-enough? guess x)
(< (abs (- (square guess) x)) 0.001))
Finally, we need a way to get started. For instance, we can always guess that the square root of any
number is 1: 23
(define (sqrt x)
(sqrt-iter 1.0 x))
If we type these definitions to the interpreter, we can use sqrt just as we can use any procedure:"""

small_text = '''A guess is improved by averaging it with the quotient of the radicand and the old guess:
(define (improve guess x)
((average guess (/ x guess)))'''

sentence_boundary = 'in terms of procedures. We start with 0.123 and some'

double_quotes = "let's see how such a ''circular'' definition"

In [78]:
cat = ''.join

In [3]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [4]:
string.ascii_letters + string.digits + string.punctuation

'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [5]:
token_pattern = re.compile(r'[^{}]+'.format(re.escape(string.ascii_letters + string.digits + string.punctuation)))
token_pattern

re.compile(r'[^abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789\!\"\#\$\%\&\\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\]\^_\`\{\|\}\~]+',
re.UNICODE)

In [6]:
punctuation_pattern = re.compile('(\d+\.\d+|\w+\'\w+|[{0}]+(?=\w)|(?<=\w)[{0}]+|[{0}]+$)'.format(re.escape(string.punctuation)))
punctuation_pattern

re.compile(r'(\d+\.\d+|\w+\'\w+|[\!\"\#\$\%\&\\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\]\^_\`\{\|\}\~]+(?=\w)|(?<=\w)[\!\"\#\$\%\&\\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\]\^_\`\{\|\}\~]+|[\!\"\#\$\%\&\\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\]\^_\`\{\|\}\~]+$)',
re.UNICODE)

In [7]:
re.split(r'(\(+(?=\w))', '((average')

['', '((', 'average']

In [8]:
[ch for gp in [re.split(punctuation_pattern, t) for t in re.split(token_pattern, small_text)]
    for ch in gp if ch]

['A',
 'guess',
 'is',
 'improved',
 'by',
 'averaging',
 'it',
 'with',
 'the',
 'quotient',
 'of',
 'the',
 'radicand',
 'and',
 'the',
 'old',
 'guess',
 ':',
 '(',
 'define',
 '(',
 'improve',
 'guess',
 'x',
 ')',
 '((',
 'average',
 'guess',
 '(/',
 'x',
 'guess',
 ')))']

In [9]:
[ch for gp in [re.split(punctuation_pattern, t) for t in re.split(token_pattern, sentence_boundary)]
    for ch in gp if ch]

['in',
 'terms',
 'of',
 'procedures',
 '.',
 'We',
 'start',
 'with',
 '0.123',
 'and',
 'some']

In [10]:
[ch for gp in [re.split(punctuation_pattern, t) for t in re.split(token_pattern, double_quotes)]
    for ch in gp if ch]

["let's", 'see', 'how', 'such', 'a', "''", 'circular', "''", 'definition']

In [11]:
[ch for gp in [re.split(punctuation_pattern, t) for t in re.split(token_pattern, sample_text)]
    for ch in gp if ch]

['Continuing',
 'this',
 'process',
 ',',
 'we',
 'obtain',
 'better',
 'and',
 'better',
 'approximations',
 'to',
 'the',
 'square',
 'root',
 '.',
 'Now',
 "let's",
 'formalize',
 'the',
 'process',
 'in',
 'terms',
 'of',
 'procedures',
 '.',
 'We',
 'start',
 'with',
 'a',
 'value',
 'for',
 'the',
 'radicand',
 '(',
 'the',
 'number',
 'whose',
 'square',
 'root',
 'we',
 'are',
 'trying',
 'to',
 'compute',
 ')',
 'and',
 'a',
 'value',
 'for',
 'the',
 'guess',
 '.',
 'If',
 'the',
 'guess',
 'is',
 'good',
 'enough',
 'for',
 'our',
 'purposes',
 ',',
 'we',
 'are',
 'done',
 ';',
 'if',
 'not',
 ',',
 'we',
 'must',
 'repeat',
 'the',
 'process',
 'with',
 'an',
 'improved',
 'guess',
 '.',
 'We',
 'write',
 'this',
 'basic',
 'strategy',
 'as',
 'a',
 'procedure',
 ':',
 '(',
 'define',
 '(',
 'sqrt',
 '-',
 'iter',
 'guess',
 'x',
 ')',
 '(',
 'if',
 '(',
 'good',
 '-',
 'enough',
 '?',
 'guess',
 'x',
 ')',
 'guess',
 '(',
 'sqrt',
 '-',
 'iter',
 '(',
 'improve',
 'guess'

In [12]:
def tokenise(text):
    return [ch for gp in [re.split(punctuation_pattern, t) for t in re.split(token_pattern, text)]
        for ch in gp if ch]

In [13]:
tokenise(sample_text)

['Continuing',
 'this',
 'process',
 ',',
 'we',
 'obtain',
 'better',
 'and',
 'better',
 'approximations',
 'to',
 'the',
 'square',
 'root',
 '.',
 'Now',
 "let's",
 'formalize',
 'the',
 'process',
 'in',
 'terms',
 'of',
 'procedures',
 '.',
 'We',
 'start',
 'with',
 'a',
 'value',
 'for',
 'the',
 'radicand',
 '(',
 'the',
 'number',
 'whose',
 'square',
 'root',
 'we',
 'are',
 'trying',
 'to',
 'compute',
 ')',
 'and',
 'a',
 'value',
 'for',
 'the',
 'guess',
 '.',
 'If',
 'the',
 'guess',
 'is',
 'good',
 'enough',
 'for',
 'our',
 'purposes',
 ',',
 'we',
 'are',
 'done',
 ';',
 'if',
 'not',
 ',',
 'we',
 'must',
 'repeat',
 'the',
 'process',
 'with',
 'an',
 'improved',
 'guess',
 '.',
 'We',
 'write',
 'this',
 'basic',
 'strategy',
 'as',
 'a',
 'procedure',
 ':',
 '(',
 'define',
 '(',
 'sqrt',
 '-',
 'iter',
 'guess',
 'x',
 ')',
 '(',
 'if',
 '(',
 'good',
 '-',
 'enough',
 '?',
 'guess',
 'x',
 ')',
 'guess',
 '(',
 'sqrt',
 '-',
 'iter',
 '(',
 'improve',
 'guess'

In [14]:
tokenise("""Exercise 1.8. Newton's method for cube roots is based on the fact that if y is an approximation to the
cube root of x, then a better approximation is given by the value

Use this formula to implement a cube-root procedure analogous to the square-root procedure. (In
section 1.3.4 we will see how to implement Newton's method in general as an abstraction of these
square-root and cube-root procedures.)

1.1.8 Procedures as Black-Box Abstractions
Sqrt is our first example of a process defined by a set of mutually defined procedures. Notice that the
definition of sqrt-iter is recursive; that is, the procedure is defined in terms of itself. The idea of
being able to define a procedure in terms of itself may be disturbing; it may seem unclear how such a
''circular'' definition could make sense at all, much less specify a well-defined process to be carried
""")

['Exercise',
 '1.8',
 '.',
 "Newton's",
 'method',
 'for',
 'cube',
 'roots',
 'is',
 'based',
 'on',
 'the',
 'fact',
 'that',
 'if',
 'y',
 'is',
 'an',
 'approximation',
 'to',
 'the',
 'cube',
 'root',
 'of',
 'x',
 ',',
 'then',
 'a',
 'better',
 'approximation',
 'is',
 'given',
 'by',
 'the',
 'value',
 'Use',
 'this',
 'formula',
 'to',
 'implement',
 'a',
 'cube',
 '-',
 'root',
 'procedure',
 'analogous',
 'to',
 'the',
 'square',
 '-',
 'root',
 'procedure',
 '.',
 '(',
 'In',
 'section',
 '1.3',
 '.',
 '4',
 'we',
 'will',
 'see',
 'how',
 'to',
 'implement',
 "Newton's",
 'method',
 'in',
 'general',
 'as',
 'an',
 'abstraction',
 'of',
 'these',
 'square',
 '-',
 'root',
 'and',
 'cube',
 '-',
 'root',
 'procedures',
 '.)',
 '1.1',
 '.',
 '8',
 'Procedures',
 'as',
 'Black',
 '-',
 'Box',
 'Abstractions',
 'Sqrt',
 'is',
 'our',
 'first',
 'example',
 'of',
 'a',
 'process',
 'defined',
 'by',
 'a',
 'set',
 'of',
 'mutually',
 'defined',
 'procedures',
 '.',
 'Notice',
 

In [15]:
def find_counts_of_item(item, counts, tuple_size):
    for i in range(len(item)-(tuple_size)):
        counts[tuple(item[i:i+tuple_size])].update([item[i+tuple_size]])
    counts[tuple(item[-tuple_size:])].update([None])
    return counts

In [16]:
def find_counts(items, tuple_size=2):
    counts = collections.defaultdict(collections.Counter)
    starts = collections.Counter()
    for item in items:
        counts = find_counts_of_item(item, counts, tuple_size)
        starts[tuple(item[:tuple_size])] += 1
    return starts, counts

In [17]:
def sentences(tokens):
    sents = []
    sent = []
    for i in range(len(tokens)):
        if tokens[i] == '.':
            sents += [sent + [tokens[i]]]
            sent = []
        else:
            sent += [tokens[i]]
    return sents

In [18]:
sentences(tokenise(sample_text))

[['Continuing',
  'this',
  'process',
  ',',
  'we',
  'obtain',
  'better',
  'and',
  'better',
  'approximations',
  'to',
  'the',
  'square',
  'root',
  '.'],
 ['Now',
  "let's",
  'formalize',
  'the',
  'process',
  'in',
  'terms',
  'of',
  'procedures',
  '.'],
 ['We',
  'start',
  'with',
  'a',
  'value',
  'for',
  'the',
  'radicand',
  '(',
  'the',
  'number',
  'whose',
  'square',
  'root',
  'we',
  'are',
  'trying',
  'to',
  'compute',
  ')',
  'and',
  'a',
  'value',
  'for',
  'the',
  'guess',
  '.'],
 ['If',
  'the',
  'guess',
  'is',
  'good',
  'enough',
  'for',
  'our',
  'purposes',
  ',',
  'we',
  'are',
  'done',
  ';',
  'if',
  'not',
  ',',
  'we',
  'must',
  'repeat',
  'the',
  'process',
  'with',
  'an',
  'improved',
  'guess',
  '.'],
 ['We',
  'write',
  'this',
  'basic',
  'strategy',
  'as',
  'a',
  'procedure',
  ':',
  '(',
  'define',
  '(',
  'sqrt',
  '-',
  'iter',
  'guess',
  'x',
  ')',
  '(',
  'if',
  '(',
  'good',
  '-',

In [19]:
one_s_starts, one_s_counts = find_counts([sentences(tokenise(sample_text))[0]])
one_s_starts, one_s_counts

(Counter({('Continuing', 'this'): 1}),
 defaultdict(collections.Counter,
             {(',', 'we'): Counter({'obtain': 1}),
              ('Continuing', 'this'): Counter({'process': 1}),
              ('and', 'better'): Counter({'approximations': 1}),
              ('approximations', 'to'): Counter({'the': 1}),
              ('better', 'and'): Counter({'better': 1}),
              ('better', 'approximations'): Counter({'to': 1}),
              ('obtain', 'better'): Counter({'and': 1}),
              ('process', ','): Counter({'we': 1}),
              ('root', '.'): Counter({None: 1}),
              ('square', 'root'): Counter({'.': 1}),
              ('the', 'square'): Counter({'root': 1}),
              ('this', 'process'): Counter({',': 1}),
              ('to', 'the'): Counter({'square': 1}),
              ('we', 'obtain'): Counter({'better': 1})}))

In [20]:
find_counts(sentences(tokenise(sample_text)), tuple_size=3)

(Counter({('(', 'See', 'exercise'): 1,
          ('Continuing', 'this', 'process'): 1,
          ('If', 'the', 'guess'): 1,
          ('Now', "let's", 'formalize'): 1,
          ('We', 'start', 'with'): 1,
          ('We', 'write', 'this'): 1}),
 defaultdict(collections.Counter,
             {("''", 'good', 'enough'): Counter({".''": 1}),
              ('(', 'See', 'exercise'): Counter({'1.7': 1}),
              ('(', 'abs', '(-'): Counter({'(': 1}),
              ('(', 'average', 'guess'): Counter({'(/': 1}),
              ('(', 'average', 'x'): Counter({'y': 1}),
              ('(',
               'define',
               '('): Counter({'average': 1,
                       'good': 1,
                       'improve': 1,
                       'sqrt': 1}),
              ('(', 'good', '-'): Counter({'enough': 2}),
              ('(', 'here', '0.001'): Counter({'):': 1}),
              ('(', 'if', '('): Counter({'good': 1}),
              ('(', 'improve', 'guess'): Counter({'x': 2}),
  

In [21]:
s = sentences(tokenise(sample_text))[0]
tuple(s[:3])

('Continuing', 'this', 'process')

In [22]:
unaccent_specials = ''.maketrans({"’": "'", "’": "'"})
def unaccent(text):
    """Remove all accents from letters.
    It does this by converting the unicode string to decomposed compatability
    form, dropping all the combining accents, then re-encoding the bytes.

    >>> unaccent('hello')
    'hello'
    >>> unaccent('HELLO')
    'HELLO'
    >>> unaccent('héllo')
    'hello'
    >>> unaccent('héllö')
    'hello'
    >>> unaccent('HÉLLÖ')
    'HELLO'
    """
    translated_text = text.translate(unaccent_specials)
    return unicodedata.normalize('NFKD', translated_text).\
        encode('ascii', 'ignore').\
        decode('utf-8')

In [23]:
sicp = unaccent(open('sicp.txt').read())
sicp_starts, sicp_counts = find_counts(sentences(tokenise(sicp)), tuple_size=3)

In [24]:
list(sicp_counts.items())[:10]

[(('the', 'controller', 'completely'), Counter({'describeof': 1})),
 (('equations', 'imply', 'that'), Counter({'(': 1})),
 (('important', 'point', 'to'), Counter({'note': 1, 'observe': 1})),
 (('separately', 'by', 'each'), Counter({'query': 1})),
 (('to', 'explore', 'variations'), Counter({'of': 1})),
 (('rock', 'songs', '.'), Counter({None: 1})),
 (('software', 'engineers', 'have'), Counter({'the': 1})),
 (('great', 'confusion', ','), Counter({'as': 1})),
 (('increasingly', 'elaborate', 'models'), Counter({'of': 1})),
 (('-', 'lambda1', '))'), Counter({'entry2': 2}))]

In [25]:
list(sicp_starts.items())[:10]

[(('29', 'There', 'is'), 1),
 (('In', 'such', 'an'), 1),
 (('To', 'answer', 'a'), 1),
 (('Y', '.'), 2),
 (('If', 'f', '('), 1),
 (('This', 'can', 'greatly'), 1),
 (('Design', 'a', 'machine'), 1),
 (('Exercise', '2.8', '.'), 1),
 (('Control', '(', 'how'), 1),
 (('C', '.'), 4)]

In [26]:
list(sicp_counts[('Gas', 'Meters', '.')].elements())

[None]

In [27]:
random.choice(list(sicp_starts.elements()))

('The', 'constructor', 'for')

In [28]:
t = ('as', '+')
t[1:] + ('or', )

('+', 'or')

In [29]:
def markov_item(starts, counts, max_len=None):
    valid_found = False
    while not valid_found:
        i = 0
        current = random.choice(list(starts.elements()))
        chain = list(current)
        next_item = random.choice(list(counts[current].elements()))
        while next_item and ((max_len and i < max_len) or not max_len):
            chain += [next_item]
            current = current[1:] + (next_item, )
            i += 1
            next_item = random.choice(list(counts[current].elements()))
            # print(chain, ':', current, ':', list(counts[current].elements()), ':', next_item)
        if max_len and i < max_len:
            valid_found = True
        if not max_len:
            valid_found = True
    return chain

In [30]:
' '.join(markov_item(sicp_starts, sicp_counts, 500))

'46 Alternatively , multiprocessing computers provide instructions that support atomic operations directly in hardware .'

In [31]:
sicp_counts['the', 'dispatch', 'procedure']

Counter({'.': 1})

In [32]:
' '.join(markov_item(one_s_starts, one_s_counts, 500))

'Continuing this process , we obtain better and better approximations to the square root .'

In [33]:
one_s_counts, one_s_starts

(defaultdict(collections.Counter,
             {(',', 'we'): Counter({'obtain': 1}),
              ('Continuing', 'this'): Counter({'process': 1}),
              ('and', 'better'): Counter({'approximations': 1}),
              ('approximations', 'to'): Counter({'the': 1}),
              ('better', 'and'): Counter({'better': 1}),
              ('better', 'approximations'): Counter({'to': 1}),
              ('obtain', 'better'): Counter({'and': 1}),
              ('process', ','): Counter({'we': 1}),
              ('root', '.'): Counter({None: 1}),
              ('square', 'root'): Counter({'.': 1}),
              ('the', 'square'): Counter({'root': 1}),
              ('this', 'process'): Counter({',': 1}),
              ('to', 'the'): Counter({'square': 1}),
              ('we', 'obtain'): Counter({'better': 1})}),
 Counter({('Continuing', 'this'): 1}))

In [34]:
def sentence_join(tokens):
    sentence = ''
    for t in tokens:
        if t[-1] not in ".,:;')-":
            sentence += ' '
        sentence += t
    return sentence.strip()

In [35]:
sentence_join(markov_item(sicp_starts, sicp_counts, 500))

'4, we can usually do better by taking advantage of additional structure that may be represented in two almost equivalent ways: and He has written the following two rules, we can find integers not divisible by 7 simply by accessing elements of this stream: ( define input- 1 input- 2 to 1 and allow the values to which they are listed.'

In [36]:
' '.join(sentence_join(markov_item(sicp_starts, sicp_counts, 500)) for _ in range(10))

"3. We model state with local state variables describing the actual object's state. Since ? x is bound in the frame. 61 Interest in logic programming peaked during the early 80s when the Japanese government began an ambitious project aimed at building superfast computers optimized to run logic programming languages. 4.1. We will compile the definition of f and start the machine, and so on, his modified eval will usually check fewer clauses than the original eval before identifying the type of the expression. This makes no difference in the values returned by the call to make- operation- exp- label dest)))) ( lambda ( pair) ( prime ? (+ ( car pair) ( cadr s))) ( set- signal ! input- 1 ( right- branch set)) ( element- of list2))) ( require ( null ? rest) result ( iter ( stream- cdr, and so will work with a system that performs arithmetic operations on complex numbers and ordinary numbers should be the sequence ( enumerate- interval stream- filter examines the stream- car s) ( if ( not ( 

In [37]:
kjb = unaccent(open('king-james-bible.txt').read())
kjb_starts, kjb_counts = find_counts(sentences(tokenise(kjb)), tuple_size=3)

In [38]:
' '.join(sentence_join(markov_item(kjb_starts, kjb_counts, 500)) for _ in range(10))

'12: 1 Help, LORD; for thy God helpeth thee. 66: 8 O bless our God, and that ye might fear the LORD from this time forth and for evermore. 9: 30 If there be laid on him a scarlet robe. 16: 38 The son of Amzi, the son of Micah, as he hath said, and be dandled upon her knees. 4: 12 And David commanded to gather together the strangers that came out of the land, saying, Because I drew him out of the temple which was in Bethlehem. 3: 24 Then he is gracious, and will give ten tribes to thee: We have such an high priest became us, who knew no sin; that we should die in the pit, and his oath unto Isaac; 105: 15 Saying, Touch not mine anointed, and be astonied one with another what they might do to Jesus. 10: 8 Neither let us commit fornication, as some of them committed, and of the south shall come into the land which I sware unto Abraham, As for Sarai thy wife, and upon the great toe of his right foot, upon the four corners of one base: and the truth shall make you free. 8: 15 We who are Jews

In [39]:
all_starts = sicp_starts + kjb_starts

In [40]:
list(all_starts.items())[:20]

[(('119', ':', '112'), 1),
 (('29', 'There', 'is'), 1),
 (('To', 'answer', 'a'), 1),
 (('Y', '.'), 2),
 (('36', ':', '36'), 3),
 (('Ye', 'shall', 'seek'), 1),
 (('And', 'in', 'like'), 1),
 (('If', 'f', '('), 1),
 (('3', ':', '47'), 1),
 (('27', ':', '24'), 7),
 (('26', ':', '28'), 5),
 (('Control', '(', 'how'), 1),
 (('16', ':', '43'), 3),
 (('They', 'should', 'be'), 1),
 (('139', ':', '9'), 1),
 (('So', 'he', 'drew'), 1),
 (('If', 'the', 'symbol'), 1),
 (('RC', 'should', 'take'), 1),
 (('?', 'type', ')'), 3),
 (('How', 'does', 'the'), 1)]

In [41]:
all_counts = collections.defaultdict(collections.Counter)
for k in sicp_counts:
    all_counts[k] = sicp_counts[k].copy()
for k in kjb_counts:
    all_counts[k] += kjb_counts[k].copy()

In [42]:
' '.join(sentence_join(markov_item(sicp_starts, sicp_counts, 500)) for _ in range(10))

"In becoming an expert programmer, just as our embedded Lisp evaluator uses primitives and control structure from the underlying Scheme system to perform arithmetic with rational numbers. What are these constants ? Similarly, find the ratios of the stack required to compute n ! by specifying that we first multiply 1 by 2, 3, 6, 10, 15,.... Exercise 3.56. Describe what kind of information ( patterns and frames) is included in this history, and how abstraction preserves for us the flexibility to consider alternate implementations. When evaluation is complete, x will be 1, y will be 2, 3 } could be represented as a pair of numbers: the x coordinate and the y coordinate. 43- 112. The timing diagram in figure 3.29, where Peter changes the account balance between the times when Paul accesses the account only very rarely. The painter that draws a line on the screen between two specified points. ( save continue) ( save n); save factorial procedure Figure 5.17: Compilation of the definition of 

In [43]:
' '.join(sentence_join(markov_item(kjb_starts, kjb_counts, 500)) for _ in range(10))

'8: 16 ( For six months did Joab remain there with all Israel, and he shall serve: I have not shewed them. 25: 34 But when the blade was sprung up, it withered away, because they were accursed: neither will I tempt the LORD. 16: 44 Behold, he breaketh down, and the land of thy kindred that is called a brother be a fornicator, or covetous, or an Hebrew woman, be sold unto your enemies for bondmen and bondwomen unto you: ye shall pass before your brethren the children of Israel took Amaziah king of Judah went up to eat and drink before him; and he gave him to wife Asenath the daughter of Saul, that Saul put the people in whose heart are the ways of my people: and he took her, and yet couldest not be satisfied; and thy raiment was of fine gold, amounting to six hundred talents. 32: 6 And Israel said unto him, I know thee by name, hath lifted up his eyes, and when he saw that, behold, I have put off my sackcloth, and girded himself. 28: 9 But the wise took oil in their vessels with their l

In [44]:
' '.join(sentence_join(markov_item(all_starts, all_counts, 500)) for _ in range(10))

"7: 3 And when I rose in the morning as he returned into the host, and went forth before them all, saying, Behold, I have brought him forth abroad, and said, Thy servant Uriah the Hittite: thirty and seven thousand. 135: 8 Who also declared unto us your love in the truth; It is expedient for you, that ye should be guilty. 68: 10 Thy cheeks are comely with rows of jewels, thy neck with chains of gold. 2: 25 And he set up the horn. 19: 14 My kinsfolk have failed, and my people love to have it return 0. 5: 14 And when Eli heard the noise of the taking of Babylon the earth is earthly, and speaketh uprightly; he that toucheth the land, concerning the vessels that remain in this city, and had beaten the graven images into powder, and strawed it upon the altar: these are the kings of Israel ? 16: 10 And I took the little book. 17: 8 Upright men shall be made smooth; 3: 2 And the Lord make you to increase and abound in love one toward another; men with men working that which is sold shall rema

In [45]:
all2 = unaccent(open('sicp-trimmed.txt').read() + open('king-james-bible.txt').read())
all2_starts, all2_counts = find_counts(sentences(tokenise(all2)), tuple_size=2)

In [46]:
[sentence_join(markov_item(all2_starts, all2_counts, 500)) for _ in range(30)]

['Typical memory systems provide a driver loop.',
 '2: 10 Which doeth great wonders, ye were sealed twelve thousand.',
 "59: 13 ( 3 4)) ( fib n) ( newline)) We must evaluate (* x y) (/ (+ x 4))) x) ( let (( avpt (/ (+ ( sum term a) ( registers- needed seq1)) ( branch ( label ev- sequence '( proc argl continue)); linkage code machine- model > < value>. Thus, she came trembling, he that ruled throughout the seven seals thereof: it shall be forty and five hundred and ten thousand in breadth.",
 '9: 22 And the chief captain that he shall do my prophets no harm: 35 And the firstborn, Jehush the second time the LORD: that I visit them, If thou cast down, even since the assembler to store the current value of the sword, and to which denominators.',
 'And he blessed Joseph, saying, They hated me, Son of man: preserve my life, or a product of the mountain which is escaped ? And the children of Israel went thither a whoring after their tongues, in the dispatch as in a 1954 paper that essentially

In [47]:
sicp_lovecraft = unaccent(open('sicp-trimmed.txt').read() + open('lovecraft.txt').read())
sl2_starts, sl2_counts = find_counts(sentences(tokenise(sicp_lovecraft)), tuple_size=2)

In [48]:
" ".join(sentence_join(markov_item(sl2_starts, sl2_counts, 500)) for _ in range(10))

'But on the moon shone down cold through the unseen tumbler meant a danger not to be increasing in vividness and darkened with dread of opening it or descend the wide appearance of this treatment of not and lisp- value rather than practical examples in this decrepit edifice. There were gods and the receiver, and shuddered. Bothersome forms, and the hellish image; but it is worth correcting. Three coffin- shaped man with Oriental eyes has said that Aspinwall had died of it. With the extra lens I could not have written the following procedures. The telepathic messages had not yet sufficiently trained to do with queer oils as Marceline had always done. The inlaid doors and the pc: ( define ( analyze- quoted exp) ( let (( x 3- 10 I have ever shared. Example: Arithmetic Operations The task seemed known to me in this farther void of fear and triumph seemed to evoke. Only after such a loathsome cost, and the awful concept of a branch instruction at the ends of the equator had been there and e

In [49]:
display(HTML('<p>' + 
             " ".join(sentence_join(markov_item(sl2_starts, sl2_counts, 500)) for _ in range(10)) + 
             '</h1>'))

In [57]:
sum(sicp_starts.values()), sum(sum(c.values()) for c in sicp_counts.values())

(8177, 249224)

In [58]:
sum(kjb_starts.values()), sum(sum(c.values()) for c in kjb_counts.values())

(26374, 958228)

In [91]:
lovecraft = unaccent(open('lovecraft-trimmed.txt').read())
lovecraft_starts, lovecraft_counts = find_counts(sentences(tokenise(lovecraft)), tuple_size=2)
sum(lovecraft_starts.values()), sum(sum(c.values()) for c in lovecraft_counts.values())

(24560, 683961)

In [92]:
sum(kjb_starts.values()) / sum(sicp_starts.values())

3.225388284211814

In [93]:
sum(sum(c.values()) for c in kjb_counts.values()) / sum(sum(c.values()) for c in sicp_counts.values())

3.8448464032356435

In [94]:
sum(lovecraft_starts.values()) / sum(sicp_starts.values()), \
sum(sum(c.values()) for c in lovecraft_counts.values()) / sum(sum(c.values()) for c in sicp_counts.values())

(3.0035465329582975, 2.7443625012037365)

In [95]:
def scale_merge(left_starts, left_start_scale, left_counts, left_count_scale, 
                right_starts, right_start_scale, right_counts, right_count_scale):
    starts = collections.Counter()
    counts = collections.defaultdict(collections.Counter)
    
    for k, n in left_starts.items():
        starts[k] = n * left_start_scale
    for k, n in right_starts.items():
        starts[k] += n * right_start_scale
    
    for k in left_counts:
        for j in left_counts[k]:
            counts[k][j] = left_counts[k][j] * left_count_scale
            
    for k in right_counts:
        for j in right_counts[k]:
            counts[k][j] = right_counts[k][j] * right_count_scale

    return starts, counts

In [96]:
sk_starts, sk_counts = scale_merge(sicp_starts, 3, sicp_counts, 4, kjb_starts, 1, kjb_counts, 1)

In [97]:
display(HTML(cat('<p>' + sentence_join(markov_item(sk_starts, sk_counts, 500)) + '</p>' for _ in range(10))))

In [98]:
sl_starts, sl_counts = scale_merge(sicp_starts, 3, sicp_counts, 2, lovecraft_starts, 1, lovecraft_counts, 1)

In [99]:
display(HTML(cat('<p>' + sentence_join(markov_item(sl_starts, sl_counts, 500)) + '</p>' for _ in range(10))))

In [100]:
lk_starts, lk_counts = scale_merge(lovecraft_starts, 1, lovecraft_counts, 1, kjb_starts, 1, kjb_counts, 1)

In [101]:
display(HTML(cat('<p>' + sentence_join(markov_item(lk_starts, lk_counts, 500)) + '</p>' for _ in range(10))))