In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
import os
import re
import requests
import time

In [3]:
from project import *

Retrieves contents of Plain Text UTF-8 file.

In [4]:
url = 'http://www.gutenberg.org/files/57988/57988-0.txt'
book = get_book(url)

In [5]:
book[:1000]

'\n\n\n\n\nProduced by Chuck Greif and the Online Distributed\nProofreading Team at http://www.pgdp.net (This file was\nproduced from images available at The Internet Archive)\n\n\n\n\n\n\n\n\n\n\n\n                                   A\n\n                           SON OF THE SOIL.\n\n\n\n\n                                   A\n\n                           SON OF THE SOIL.\n\n\n                                  BY\n\n                            MRS. OLIPHANT.\n\n\n                            _NEW EDITION._\n\n\n                                London:\n\n                           MACMILLAN AND CO.\n\n                                 1872.\n\n       _The Right of Translation and Reproduction is reserved._\n\n\n\n\n                                LONDON:\n\n                 R. CLAY, SONS, AND TAYLOR, PRINTERS,\n\n                           BREAD STREET HILL\n\n\n\n\n                          A SON OF THE SOIL.\n\n\n\n\n_CHAPTER I._\n\n\nâ\x80\x9cI say, you boy, it always rains here, does

Tokenizes text into legible tokens, with '\x02' meaning start and '\x03' meaning end.

In [6]:
tokens = tokenize(book)
tokens[:10], tokens[-10:]

(['\x02',
  'Produced',
  'by',
  'Chuck',
  'Greif',
  'and',
  'the',
  'Online',
  'Distributed',
  'Proofreading'],
 ['of', 'the', 'Soil', ',', 'by', 'Mrs', '.', 'Margaret', 'Oliphant', '\x03'])

Basic uniform model assumes the probability of all words are likely equal, no matter the frequency of said word.

In [7]:
uniform = UniformLM(tokens)

In [8]:
pd.DataFrame({'probability': uniform.mdl})

Unnamed: 0,probability
,0.000093
Produced,0.000093
by,0.000093
Chuck,0.000093
Greif,0.000093
...,...
Gutenberg,0.000093
',0.000093
Soil,0.000093
Margaret,0.000093


In [9]:
uniform.sample(100)

'blandly rank amity hereditary unknown unyielding cursed assertion body thick each golden thank printshop sanctuaries tongue woo emerging couldna plough humiliating entered stables neighbour prospects their off disappeared continuous matron fence broke cherished counts Arturo accompaniment place opinions spiteful blind mignonne tattoo nearer quarrels timidity virtues roamed rearranged Again ladies approve thunderstorm jealous ! blotted regular snowy September chilled shaky image clock fain strictest bought reposing question pleading impostor greatest fabric civilized indiscretion smother It manifold Park Argus healthful case literary headed niche futility strenuous tenth doctrine medicine murderer represent shaky provision hierarchies meantime wi Ã HARRY decline undesired soaring'

Unigram Model makes probability of choosing a specific token based solely on token frequency.

In [10]:
unigram = UnigramLM(tokens)

In [11]:
pd.DataFrame({'probability': unigram.mdl})

Unnamed: 0,probability
,0.005200
Produced,0.000004
by,0.002992
Chuck,0.000004
Greif,0.000004
...,...
Gutenberg,0.000004
',0.000004
Soil,0.000004
Margaret,0.000004


The sample will make more sense than the previous one.

In [12]:
unigram.sample(100)

'What meet which his helpless least something thought . \x80 away perhaps \x80 you bank what keeping . yet him and , accounted us attainable this happy . an \x9c imagination to a the \x80 time to news was been . on to get and or \x9d ? conscious man you , ; rest responses is him \x80 if fading , and your ; under on grand , recognized serve matter said which much his feared not arm the chair it the - , \x9d \x99 doubtful the this at \x99 take which for very mean way you so any'

Finally using the NGram to create Bigram and Trigram, which uses conditional probability to choose the succeeding token(s) based off of the preceding token(s).

In [13]:
bigram = NGramLM(3, tokens)

In [14]:
bigram.mdl

Unnamed: 0,ngram,n1gram,prob
0,"(, Produced, by)","(, Produced)",1.000000
1,"(Produced, by, Chuck)","(Produced, by)",1.000000
2,"(by, Chuck, Greif)","(by, Chuck)",1.000000
3,"(Chuck, Greif, and)","(Chuck, Greif)",1.000000
4,"(Greif, and, the)","(Greif, and)",1.000000
...,...,...,...
167480,"(,, by, Mrs)","(,, by)",0.018519
167481,"(by, Mrs, .)","(by, Mrs)",1.000000
167482,"(Mrs, ., Margaret)","(Mrs, .)",0.010989
167483,"(., Margaret, Oliphant)","(., Margaret)",1.000000


In [15]:
bigram.sample(100)

'\x02 MRS . OLIPHANT . = \x03 \x02 Upon which , to say , you can call on me again , since you say , for goodness sake , but by this time had almost been capable of introducing their sombre shadows into the drawing - room , where there was a very different circumstances , â \x80 \x99 s mother understood what he meant to be competed for immediately after Christmas , â \x80 \x9c one might choose to be talked about , as most people . He knew that Colin regarded this scene , the more closely connected \x03'

Now with a Trigram!

In [16]:
trigram = NGramLM(3, tokens)

In [17]:
trigram.mdl

Unnamed: 0,ngram,n1gram,prob
0,"(, Produced, by)","(, Produced)",1.000000
1,"(Produced, by, Chuck)","(Produced, by)",1.000000
2,"(by, Chuck, Greif)","(by, Chuck)",1.000000
3,"(Chuck, Greif, and)","(Chuck, Greif)",1.000000
4,"(Greif, and, the)","(Greif, and)",1.000000
...,...,...,...
167480,"(,, by, Mrs)","(,, by)",0.018519
167481,"(by, Mrs, .)","(by, Mrs)",1.000000
167482,"(Mrs, ., Margaret)","(Mrs, .)",0.010989
167483,"(., Margaret, Oliphant)","(., Margaret)",1.000000


In [18]:
trigram.sample(100)

'\x02 Lauderdale , â \x80 \x9c and the thanksgivings . I hope you â \x80 \x9d said the curate stopped speaking . He talks of going to Eetaly , and took out the big cradle with a skeleton in it , â \x80 \x99 to speak of private devotions , â \x80 \x9c I â \x80 \x99 s laugh sounded like mockery , holding it on the dear heart of Rome , where so many changes , and which even Colin , telling her her voice . \x03 \x02 â \x80 \x99 s ear like the sweetest little woman of \x03'

In [19]:
fivegram = NGramLM(5, tokens)

In [20]:
fivegram.mdl

Unnamed: 0,ngram,n1gram,prob
0,"(, Produced, by, Chuck, Greif)","(, Produced, by, Chuck)",1.0
1,"(Produced, by, Chuck, Greif, and)","(Produced, by, Chuck, Greif)",1.0
2,"(by, Chuck, Greif, and, the)","(by, Chuck, Greif, and)",1.0
3,"(Chuck, Greif, and, the, Online)","(Chuck, Greif, and, the)",1.0
4,"(Greif, and, the, Online, Distributed)","(Greif, and, the, Online)",1.0
...,...,...,...
230458,"(the, Soil, ,, by, Mrs)","(the, Soil, ,, by)",1.0
230459,"(Soil, ,, by, Mrs, .)","(Soil, ,, by, Mrs)",1.0
230460,"(,, by, Mrs, ., Margaret)","(,, by, Mrs, .)",1.0
230461,"(by, Mrs, ., Margaret, Oliphant)","(by, Mrs, ., Margaret)",1.0


In [21]:
fivegram.sample(100)

'\x02 â \x80 \x9c Oh , an honourable occupation , â \x80 \x9d continued big Colin , looking admiringly at the comely mother of his boys . â \x80 \x9c Ay , â \x80 \x9d he repeated , holding out his hand another time . â \x80 \x9c Eh , Colin , laddie , if that was possible . When I hear the poor people here singing their vespers - - - - â \x80 \x9d \x03 \x02 â \x80 \x9c I don â \x80 \x99 t stand there in the fog like a ghost ; if you have anything \x03'