In [11]:
import re 
import urllib2 
import graphlab as gl
import nltk

# Directory the books SFrame is saved into at the end of this cell.
BASE_DIR = "/Users/matthewkrey/anaconda2/DATO/sherlock" # match BASE_DIR to your local directory path

books_url = "http://sherlock-holm.es/ascii/"
# Raw string so the regex escape (\s) is not left to Python's string-escape handling;
# the resulting pattern is exactly the same as before.
re_books_links = re.compile(r'"piwik_download"\s+href="(?P<link>.*?)">(?P<title>.*?)</a>', re.MULTILINE)


def _fetch(url):
    """Return the raw body served at `url`, closing the HTTP response even if the read fails."""
    response = urllib2.urlopen(url)
    try:
        return response.read()
    finally:
        response.close()


# Each match becomes a dict with the 'link' and 'title' named groups.
html = _fetch(books_url)
books_list = [m.groupdict() for m in re_books_links.finditer(html)]

# Filter books due to copyright issues. In this code, we filter out "The Complete Canon",
# "The Case-Book of Sherlock Holmes", and "The Canon — U.S. edition"
# (for more information please read the note above).

filtered_books = set(["The Complete Canon", "The Case-Book of Sherlock Holmes", "The Canon — U.S. edition" ])
books_list = [d for d in books_list if d['title'] not in filtered_books]

# Download the books' texts one after another (sequentially rather than in parallel,
# so the website is not overloaded).
for d in books_list:
    d['text'] = _fetch("http://sherlock-holm.es" + d['link']).strip()

# A list of dicts loads as a single dict column "X1"; unpack it into one column per key.
sf = gl.SFrame(books_list).unpack("X1", column_name_prefix="")
sf.save("%s/books.sframe" % BASE_DIR)
sf.head(3)


link,text,title
/stories/plain- text/advs.txt ...,THE ADVENTURES OF SHERLOCK HOLMES\n\n ...,The Adventures of Sherlock Holmes ...
/stories/plain- text/mems.txt ...,THE MEMOIRS OF SHERLOCK HOLMES\n\n ...,The Memoirs of Sherlock Holmes ...
/stories/plain- text/retn.txt ...,THE RETURN OF SHERLOCK HOLMES\n\n ...,The Return of Sherlock Holmes ...


In [12]:
# Point GraphLab Canvas at the 'ipynb' target (presumably so visualizations render
# inside the notebook -- confirm against the GraphLab Canvas docs).
gl.canvas.set_target('ipynb')

# Load the English Punkt tokenizer resource once; txt2sentences calls its tokenize() method.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
def txt2sentences(txt, remove_none_english_chars=True, sentence_tokenizer=None):
    """
    Lazily split a text into sentences.

    :param txt: input text, either a UTF-8 encoded byte string or an already-decoded
        unicode string.
    :param remove_none_english_chars: if True, every character outside a-z / A-Z
        (digits, punctuation, whitespace, accented letters, ...) is replaced by a single space.
    :param sentence_tokenizer: object exposing ``tokenize(text)`` that returns the sentences
        of ``text``; defaults to the module-level ``tokenizer``.
    :return: generator yielding the sentences one at a time, in their original order.
    :rtype: generator
    """
    if sentence_tokenizer is None:
        sentence_tokenizer = tokenizer
    # Decode byte strings only: calling .decode() on already-decoded text makes Python 2
    # implicitly ASCII-encode it first (failing on any non-ASCII character), and Python 3
    # text strings have no .decode() at all.
    if isinstance(txt, bytes):
        txt = txt.decode("utf8")
    # split text into sentences
    for s in sentence_tokenizer.tokenize(txt):
        if remove_none_english_chars:
            # replace each non-English-letter character with a space (length is preserved)
            s = re.sub("[^a-zA-Z]", " ", s)
        yield s
        
# Reload the books saved to disk and attach, per book, the full list of its sentences.
sf = gl.load_sframe("%s/books.sframe" % BASE_DIR)
sf['sentences'] = sf['text'].apply(
    lambda book_text: list(txt2sentences(book_text))
)

In [13]:
# Explode every book into one row per sentence: [book title, whitespace-stripped sentence].
sf_sentences = sf.flat_map(
    ['title', 'text'],
    lambda book: [
        [book['title'], sentence.strip()]
        for sentence in txt2sentences(book['text'])
    ]
)
# The second column now holds single sentences rather than whole texts, so name it accordingly.
sf_sentences = sf_sentences.rename({'text': 'sentence'})

# split each sentence into words (maximal runs of word characters)
re_words_split = re.compile("(\w+)")
sf_sentences['words'] = sf_sentences['sentence'].apply(
    lambda sentence: re_words_split.findall(sentence)
)

sf_sentences.save("%s/sentences.sframe" % BASE_DIR)
sf_sentences.head(3)

title,sentence,words
The Adventures of Sherlock Holmes ...,THE ADVENTURES OF SHERLOCK HOLMES ...,"[THE, ADVENTURES, OF, SHERLOCK, HOLMES, Art ..."
The Adventures of Sherlock Holmes ...,I have seldom heard him mention her under any ...,"[I, have, seldom, heard, him, mention, her, un ..."
The Adventures of Sherlock Holmes ...,In his eyes she eclipses and predominates ...,"[In, his, eyes, she, eclipses, and, ..."
