<h2>Markov Model (Natrural Language Processing)</h2>

In [2]:
import numpy as np
import nltk
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('punkt')
import random
import pandas
import time
import os
import re

[nltk_data] Downloading package punkt to /home/sam/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
path = "/home/sam/projects/machine-learning/data/sherlock/sherlock/"

def read_books(path):
    stories = []
    for _, _, files in os.walk(path):
        for file in files:
            with open(path + file) as f:
                for line in f:
                    line = line.strip()
                    if line == '----------':
                        break
                    if line != '':
                        stories.append(line)
    return stories

In [4]:
stories = read_books(path)

In [5]:
print(len(stories))

215021


In [6]:
print(stories[:30])

['THE RETURN OF SHERLOCK HOLMES', 'Arthur Conan Doyle', 'Table of contents', 'The Adventure of the Empty House', 'The Adventure of the Norwood Builder', 'The Adventure of the Dancing Men', 'The Adventure of the Solitary Cyclist', 'The Adventure of the Priory School', 'The Adventure of Black Peter', 'The Adventure of Charles Augustus Milverton', 'The Adventure of the Six Napoleons', 'The Adventure of the Three Students', 'The Adventure of the Golden Pince-Nez', 'The Adventure of the Missing Three-Quarter', 'The Adventure of the Abbey Grange', 'The Adventure of the Second Stain', 'THE ADVENTURE OF THE EMPTY HOUSE', 'It was in the spring of the year 1894 that all London was interested,', 'and the fashionable world dismayed, by the murder of the Honourable', 'Ronald Adair under most unusual and inexplicable circumstances. The', 'public has already learned those particulars of the crime which came', 'out in the police investigation; but a good deal was suppressed upon', 'that occasion, sinc

In [7]:
def clean_stories(stories):
    
    cleaned_stories = []
    
    for line in stories:
        line = line.lower()
        line = re.sub(r"[,.\"\'!@#$%^&*(){}?/;`~:<>+=-\\]", "", line)
        tokens = word_tokenize(line)
        words = [word for word in tokens if word.isalpha()]
        cleaned_stories += words
    return cleaned_stories


In [8]:
tokens = clean_stories(stories)

In [10]:
print(tokens[:100])

['the', 'return', 'of', 'sherlock', 'holmes', 'arthur', 'conan', 'doyle', 'table', 'of', 'contents', 'the', 'adventure', 'of', 'the', 'empty', 'house', 'the', 'adventure', 'of', 'the', 'norwood', 'builder', 'the', 'adventure', 'of', 'the', 'dancing', 'men', 'the', 'adventure', 'of', 'the', 'solitary', 'cyclist', 'the', 'adventure', 'of', 'the', 'priory', 'school', 'the', 'adventure', 'of', 'black', 'peter', 'the', 'adventure', 'of', 'charles', 'augustus', 'milverton', 'the', 'adventure', 'of', 'the', 'six', 'napoleons', 'the', 'adventure', 'of', 'the', 'three', 'students', 'the', 'adventure', 'of', 'the', 'golden', 'the', 'adventure', 'of', 'the', 'missing', 'the', 'adventure', 'of', 'the', 'abbey', 'grange', 'the', 'adventure', 'of', 'the', 'second', 'stain', 'the', 'adventure', 'of', 'the', 'empty', 'house', 'it', 'was', 'in', 'the', 'spring', 'of', 'the', 'year']


In [11]:
print(len(tokens))

2332247


In [12]:
def build_markov_model(tokens, n_gram=2):
    
    markov_model = {}
    
    for i in range(len(tokens) - n_gram-1):
        
        curr_state = ""
        next_state = ""
        
        for j in range(n_gram):
            
            curr_state += tokens[i+j] + " "
            next_state += tokens[i+j+n_gram] + " "
            
        curr_state = curr_state[:-1]
        next_state = next_state[:-1]
        
        if curr_state not in markov_model:
            
            markov_model[curr_state] = {}
            markov_model[curr_state][next_state] = 1
        else:
            if next_state in markov_model[curr_state]:
                markov_model[curr_state][next_state] += 1
            else:
                markov_model[curr_state][next_state] = 1
    
    # calculating transition probabilities
    for curr_state, transition in markov_model.items():
        total = sum(transition.values())
        for state, count in transition.items():
            markov_model[curr_state][state] = count/total
        
    return markov_model

In [13]:
markov_model = build_markov_model(tokens=tokens, n_gram=2)

In [14]:
print(list(markov_model.items())[:5])

[('the return', {'of sherlock': 0.2, 'of lady': 0.16, 'track we': 0.16, 'and his': 0.08, 'to hatherley': 0.16, 'of the': 0.12, 'of our': 0.12}), ('return of', {'sherlock holmes': 0.2777777777777778, 'lady maynooth': 0.2222222222222222, 'his truculent': 0.16666666666666666, 'the new': 0.16666666666666666, 'our colleagues': 0.16666666666666666}), ('of sherlock', {'holmes arthur': 0.08620689655172414, 'holmes there': 0.06896551724137931, 'holmes was': 0.13793103448275862, 'holmes it': 0.05172413793103448, 'holmes he': 0.034482758620689655, 'holmes requests': 0.06896551724137931, 'holmes a': 0.06896551724137931, 'holmes silver': 0.06896551724137931, 'holmes the': 0.1896551724137931, 'holmes upon': 0.06896551724137931, 'everywhere since': 0.06896551724137931, 'holmess defects': 0.05172413793103448, 'holmes preface': 0.034482758620689655}), ('sherlock holmes', {'arthur conan': 0.005143277002204262, 'had interested': 0.0029390154298310064, 'there were': 0.005878030859662013, 'was standing': 0

In [15]:
print(len(markov_model.keys()))

208714


In [16]:
print(markov_model["sherlock holmes"])

{'arthur conan': 0.005143277002204262, 'had interested': 0.0029390154298310064, 'there were': 0.005878030859662013, 'was standing': 0.0029390154298310064, 'not so': 0.0029390154298310064, 'is free': 0.0029390154298310064, 'london has': 0.0029390154298310064, 'was leaning': 0.0029390154298310064, 'listened with': 0.00881704628949302, 'said he': 0.00881704628949302, 'you may': 0.008082292432035268, 'took any': 0.0029390154298310064, 'the constables': 0.0029390154298310064, 'very thoughtful': 0.0029390154298310064, 'preserved his': 0.0029390154298310064, 'you remember': 0.0029390154298310064, 'gave his': 0.0029390154298310064, 'in fact': 0.0029390154298310064, 'i only': 0.0029390154298310064, 'was a': 0.014695077149155033, 'good lord': 0.0029390154298310064, 'had listened': 0.005878030859662013, 'to undertake': 0.0029390154298310064, 'can return': 0.0029390154298310064, 'left the': 0.0029390154298310064, 'you will': 0.0029390154298310064, 'smiled demurely': 0.0029390154298310064, 'would b

In [17]:
sherlock_poss = sorted(list(markov_model["sherlock holmes"].values()))

In [18]:
print(sherlock_poss)

[0.0007347538574577516, 0.0007347538574577516, 0.0014695077149155032, 0.0014695077149155032, 0.0014695077149155032, 0.002204261572373255, 0.002204261572373255, 0.002204261572373255, 0.002204261572373255, 0.002204261572373255, 0.002204261572373255, 0.002204261572373255, 0.002204261572373255, 0.002204261572373255, 0.002204261572373255, 0.002204261572373255, 0.002204261572373255, 0.002204261572373255, 0.002204261572373255, 0.002204261572373255, 0.002204261572373255, 0.002204261572373255, 0.002204261572373255, 0.002204261572373255, 0.002204261572373255, 0.002204261572373255, 0.002204261572373255, 0.002204261572373255, 0.002204261572373255, 0.002204261572373255, 0.002204261572373255, 0.002204261572373255, 0.002204261572373255, 0.002204261572373255, 0.002204261572373255, 0.002204261572373255, 0.002204261572373255, 0.002204261572373255, 0.002204261572373255, 0.002204261572373255, 0.002204261572373255, 0.002204261572373255, 0.002204261572373255, 0.002204261572373255, 0.002204261572373255, 0.00

In [23]:
def generate_story(markov_model, limit=100, start='my god'):
    n = 0
    curr_state = start
    next_state = None
    story = ""
    story+=curr_state+" "
    while n < limit:
        next_state = random.choices(list(markov_model[curr_state].keys()),
                                    list(markov_model[curr_state].values()))
        
        curr_state = next_state[0]
        story+=curr_state+" "
        n+=1
    return story

In [27]:
story1 = generate_story(markov_model, 200, "sherlock holmes")

In [28]:
print(story1)

sherlock holmes a few good cases to his powers upon the violin these were private grounds however or country was more than i could have wept over the two of the young lady in the blue carbuncle which appeared to me to give him away and the grotesque inconceivable nature of the sudden death of cardinal tosca an inquiry which can bear holmes and his eyes rested thoughtfully upon the subject we guard our secret very jealously however and if you had come now dr sterndale surely the inference is plain you mean that it had not penetrated the fact that the odd trick is in new zealand consolidated a hundred and twenty pounds in a lifetime i offered the advantages which london then and you will reach victoria in the morning you need not draw the blind he was clearly so scared by his mischance in breaking entirely with the murderer could roam london on their way once more in the district and had once slumbered the ashes of my pipe on the top of the path and we were at the house and could not hav

In [31]:
story2 = generate_story(markov_model, 200, "my god")

In [32]:
print(story2)

my god its watson said he pulling up her veil and dropped them away they are a fierce excitement behind his spectacles pushed up on to the roof had fallen upon our shoulders at any moment be seized and searched the stairs and he was shaking his fists if he found it difficult to tell the truth lucy it isnt likely id give it they dont have days of fog in the storm of the night of his matted hair and once in the house he beat his head with a smile quivered on your lips set your foot over that threshold again here in an instant watson you will you let me share this adventure with you in the following year he retired some eleven years ago and we returned to the main facts of great importance it excludes the idea of a man in so clumsy a fashion but a deal of business in the crypt and then i will not look with suspicious and sidelong eyes she had seen he was a very safe place when the offer was accepted and i talked it over there is something in it but above all no violence gentlemen no viole

In [33]:
story3 = generate_story(markov_model, 20, "watson was")

In [35]:
print(story3)

watson was with me oh yes easily the rest you must yourself have formed a theory about how a miners camp had been attacked ive always been on the staff of the kennington road but he came back to europe and took 
