In [2]:
import re 
import urllib2 
import graphlab as gl

BASE_DIR = "/Users/matthewkrey/anaconda2/DATO/sherlock" # match BASE_DIR to your local directory path

books_url = "http://sherlock-holm.es/ascii/"
re_books_links = re.compile("\"piwik_download\"\s+href=\"(?P<link>.*?)\">(?P<title>.*?)</a>", re.MULTILINE)
html = urllib2.urlopen(books_url).read()
books_list = [m.groupdict() for m in re_books_links.finditer(html)]
print books_list

[{'link': '/stories/plain-text/cano.txt', 'title': 'The Complete Canon'}, {'link': '/stories/plain-text/cnus.txt', 'title': 'The Canon \xe2\x80\x94 U.S. edition'}, {'link': '/stories/plain-text/advs.txt', 'title': 'The Adventures of Sherlock Holmes'}, {'link': '/stories/plain-text/mems.txt', 'title': 'The Memoirs of Sherlock Holmes'}, {'link': '/stories/plain-text/retn.txt', 'title': 'The Return of Sherlock Holmes'}, {'link': '/stories/plain-text/lstb.txt', 'title': 'His Last Bow'}, {'link': '/stories/plain-text/case.txt', 'title': 'The Case-Book of Sherlock Holmes'}, {'link': '/stories/plain-text/stud.txt', 'title': 'A Study In Scarlet'}, {'link': '/stories/plain-text/sign.txt', 'title': 'The Sign of the Four'}, {'link': '/stories/plain-text/houn.txt', 'title': 'The Hound of the Baskervilles'}, {'link': '/stories/plain-text/vall.txt', 'title': 'The Valley of Fear'}, {'link': '/stories/plain-text/scan.txt', 'title': 'A Scandal in Bohemia'}, {'link': '/stories/plain-text/redh.txt', 'tit

In [3]:
# Filter books due to copyright issues. In this code, we filtered "The Complete Canon", “Case-Book of Sherlock Holmes” books, and
# "The Canon — U.S. edition" book (For more information please read the note above).

filtered_books = set(["The Complete Canon", "The Case-Book of Sherlock Holmes", "The Canon - U.S. edition" ])
books_list = filter(lambda d: d['title'] not in filtered_books, books_list)

# Download books' texts (to not overload the website we download the text in batch and not in parallel)
for d in books_list:
    d['text'] = urllib2.urlopen("http://sherlock-holm.es" + d['link']).read().strip()

In [4]:
# Load list of dictionaries containing text of interest into SFrame

sf = gl.SFrame(books_list).unpack("X1", column_name_prefix="")
sf.save("%s/books.sframe" % BASE_DIR)
sf.head(3)

[INFO] This non-commercial license of GraphLab Create is assigned to matthew.krey@flatironschool.com and will expire on December 14, 2016. For commercial licensing options, visit https://dato.com/buy/.

[INFO] Start server at: ipc:///tmp/graphlab_server-43356 - Server binary: /Users/matthewkrey/anaconda2/envs/dato-env/lib/python2.7/site-packages/graphlab/unity_server - Server log: /tmp/graphlab_server_1452983201.log
[INFO] GraphLab Server Version: 1.8


link,text,title
/stories/plain- text/cnus.txt ...,THE COMPLETE SHERLOCK HOLMES\n\n ...,The Canon — U.S. edition ...
/stories/plain- text/advs.txt ...,THE ADVENTURES OF SHERLOCK HOLMES\n\n ...,The Adventures of Sherlock Holmes ...
/stories/plain- text/mems.txt ...,THE MEMOIRS OF SHERLOCK HOLMES\n\n ...,The Memoirs of Sherlock Holmes ...


In [6]:
# With SFrame now "loaded", we can leverage Python stdlib for stats & visualizations

# stats 
sf['chars_num'] = sf['text'].apply(lambda t: len(t))
sf.head(3)

# visualization 
sf['chars_num'].show()

Canvas is accessible via web browser at the URL: http://localhost:57261/index.html
Opening Canvas in default web browser.


In [20]:
# Calculating number of words in a text. Various approaches.
# Note differences in how each approach splits words & punctuation! 

text = """I think that you know me well enough, Watson, to understand that I am by no means a nervous man. At the same time,
it is stupidity rather than courage to refuse to recognize danger when it is close upon you."""

# Using .split()

print "##########"
print "Using .split() method"
print "##########"
print text.split()
print "##########"
print len(text.split())
print "##########"

# Using regular expressions

re_words_split = re.compile("(\w+)")

print "##########"
print "Using Regular Expressions"
print "##########"
print re_words_split.findall(text)
print "##########"
print len(re_words_split.findall(text))
print "##########"

# Using Natural Language Toolkit (NLTK)
# Note from DATO: Remember to download the NLTK's punkt package by running nltk.download() from the Interactive Python Shell
import nltk
nltk.download('punkt')

print "##########"
print "Using NLTK"
print "##########"
print nltk.word_tokenize(text)
print "##########"
print len(nltk.word_tokenize(text))
print "##########"

##########
Using .split() method
##########
['I', 'think', 'that', 'you', 'know', 'me', 'well', 'enough,', 'Watson,', 'to', 'understand', 'that', 'I', 'am', 'by', 'no', 'means', 'a', 'nervous', 'man.', 'At', 'the', 'same', 'time,', 'it', 'is', 'stupidity', 'rather', 'than', 'courage', 'to', 'refuse', 'to', 'recognize', 'danger', 'when', 'it', 'is', 'close', 'upon', 'you.']
##########
41
##########
##########
Using Regular Expressions
##########
['I', 'think', 'that', 'you', 'know', 'me', 'well', 'enough', 'Watson', 'to', 'understand', 'that', 'I', 'am', 'by', 'no', 'means', 'a', 'nervous', 'man', 'At', 'the', 'same', 'time', 'it', 'is', 'stupidity', 'rather', 'than', 'courage', 'to', 'refuse', 'to', 'recognize', 'danger', 'when', 'it', 'is', 'close', 'upon', 'you']
##########
41
##########
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/matthewkrey/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
##########
Using NLTK
##########
['I', 'think', 'that', 'yo

In [21]:
# Count number of words using regex approach 

sf['words_num'] = sf['text'].apply(lambda t: len(re_words_split.findall(text)))

In [22]:
# Count number of sentences using NLTK

tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
def txt2sentences(txt, remove_none_english_chars=True):
    """
    Split the English text into sentences using NLTK
    :param txt: input text.
    :param remove_none_english_chars: if True then remove none English chars from text
    :return: string in which each line consists of single sentence from the original input text.
    :rtype: str
    """
    # decode to utf8 to avoid encoding problems 
    txt = txt.decode("utf8")
    # split text into sentences using NLTK package
    for s in tokenizer.tokenize(txt):
        if remove_none_english_chars:
            # remove none English chars
            s = re.sub("[^a-zA-Z]", " ", s)
            yield s 

# count number of sentences
sf['sentences_num'] = sf['text'].apply(lambda t: len(list(txt2sentences(t))))

# visualize 
sf[['chars_num','words_num','sentences_num']].show()

Canvas is updated and available in a tab in the default browser.


In [23]:
# More advanced text analysis with GraphLab's text_analytics.count_words toolkit and collection.Counter function

sf['words_count'] = gl.text_analytics.count_words(sf['text'], to_lower=True)
sf['sherlock_count'] = sf['words_count'].apply(lambda d: d.get('sherlock',0))
sf['watson_count'] = sf['words_count'].apply(lambda d: d.get('watson',0))
sf['elementary_count'] = sf['words_count'].apply(lambda d: d.get('elementary',0))
sf[['sherlock_count', 'watson_count', 'elementary_count']].show()

Canvas is updated and available in a tab in the default browser.


In [24]:
# Attempt to use Linear Regression to predict the number of times the word 'Sherlock' appears in a text based on the number of times the word 'Watson' appears in the text.

linear_reg = gl.linear_regression.create(sf, target='sherlock_count', features=['watson_count'])
linear_reg.show()

PROGRESS: Linear regression:
PROGRESS: --------------------------------------------------------
PROGRESS: Number of examples          : 65
PROGRESS: Number of features          : 1
PROGRESS: Number of unpacked features : 1
PROGRESS: Number of coefficients    : 2
PROGRESS: Starting Newton Method
PROGRESS: --------------------------------------------------------
PROGRESS: +-----------+----------+--------------+--------------------+---------------+
PROGRESS: | Iteration | Passes   | Elapsed Time | Training-max_error | Training-rmse |
PROGRESS: +-----------+----------+--------------+--------------------+---------------+
PROGRESS: | 1         | 2        | 1.008692     | 75.095997          | 20.931039     |
PROGRESS: +-----------+----------+--------------+--------------------+---------------+
PROGRESS: SUCCESS: Optimal solution found.
PROGRESS:
Canvas is updated and available in a tab in the default browser.
