In [11]:
import re 
import urllib2 
import graphlab as gl
import nltk

BASE_DIR = "/Users/matthewkrey/anaconda2/DATO/sherlock" # match BASE_DIR to your local directory path

# Titles left out due to copyright issues: "The Complete Canon", "The Case-Book of Sherlock Holmes"
# and "The Canon — U.S. edition" (for more information please read the note above).
filtered_books = set(["The Complete Canon", "The Case-Book of Sherlock Holmes", "The Canon — U.S. edition" ])

# Fetch the index page and extract one {'link': ..., 'title': ...} dict per download
# anchor, skipping the filtered titles.
books_url = "http://sherlock-holm.es/ascii/"
re_books_links = re.compile("\"piwik_download\"\s+href=\"(?P<link>.*?)\">(?P<title>.*?)</a>", re.MULTILINE)
html = urllib2.urlopen(books_url).read()
books_list = [
    match.groupdict()
    for match in re_books_links.finditer(html)
    if match.group('title') not in filtered_books
]

# Fetch the text of every remaining book, one request after another (sequentially
# rather than in parallel, so that the website is not overloaded).
for book in books_list:
    book_url = "http://sherlock-holm.es" + book['link']
    book['text'] = urllib2.urlopen(book_url).read().strip()

# Turn the list of dicts into an SFrame with one column per dict key, persist it
# under BASE_DIR and preview the first rows.
sf = gl.SFrame(books_list).unpack("X1", column_name_prefix="")
sf.save("%s/books.sframe" % BASE_DIR)
sf.head(3)


link,text,title
/stories/plain- text/advs.txt ...,THE ADVENTURES OF SHERLOCK HOLMES\n\n ...,The Adventures of Sherlock Holmes ...
/stories/plain- text/mems.txt ...,THE MEMOIRS OF SHERLOCK HOLMES\n\n ...,The Memoirs of Sherlock Holmes ...
/stories/plain- text/retn.txt ...,THE RETURN OF SHERLOCK HOLMES\n\n ...,The Return of Sherlock Holmes ...


In [12]:
# Send GraphLab Canvas output to the 'ipynb' target (i.e. into this notebook).
gl.canvas.set_target('ipynb')

# Load NLTK's pickled English Punkt tokenizer; txt2sentences below uses it to split text into sentences.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
def txt2sentences(txt, remove_none_english_chars=True):
    """
    Split English text into sentences using the NLTK tokenizer held in the
    module-level ``tokenizer`` variable.
    :param txt: input text, either a UTF-8 encoded byte string or already-decoded text.
    :param remove_none_english_chars: if True, every character that is not an ASCII
        letter (a-z, A-Z) is replaced by a single space in each sentence.
    :return: generator that yields the sentences of the input text one at a time,
        in their original order.
    :rtype: generator of text strings
    """
    # Decode raw bytes only: calling .decode() on text that is already decoded
    # fails on non-ASCII characters (Python 2) or does not exist at all (Python 3).
    if isinstance(txt, bytes):
        txt = txt.decode("utf8")
    # split text into sentences using nltk packages
    for sentence in tokenizer.tokenize(txt):
        if remove_none_english_chars:
            # blank out everything except English letters (digits and punctuation included)
            sentence = re.sub("[^a-zA-Z]", " ", sentence)
        yield sentence
        
# Reload the saved books table and attach to every book the list of its sentences.
sf = gl.load_sframe("%s/books.sframe" % BASE_DIR)
sf['sentences'] = sf['text'].apply(
    lambda book_text: list(txt2sentences(book_text)))

In [13]:
# Explode the books table into one row per sentence, keeping the book title next
# to each (whitespace-stripped) sentence.
sf_sentences = sf.flat_map(
    ['title', 'text'],
    lambda book: [[book['title'], sentence.strip()] for sentence in txt2sentences(book['text'])])
# The second output column was created under the name 'text'; it holds single sentences.
sf_sentences = sf_sentences.rename({'text': 'sentence'})

# Break every sentence into its words, then persist and preview the result.
re_words_split = re.compile("(\w+)")
sf_sentences['words'] = sf_sentences['sentence'].apply(
    lambda sentence: re_words_split.findall(sentence))
sf_sentences.save("%s/sentences.sframe" % BASE_DIR)
sf_sentences.head(3)

title,sentence,words
The Adventures of Sherlock Holmes ...,THE ADVENTURES OF SHERLOCK HOLMES ...,"[THE, ADVENTURES, OF, SHERLOCK, HOLMES, Art ..."
The Adventures of Sherlock Holmes ...,I have seldom heard him mention her under any ...,"[I, have, seldom, heard, him, mention, her, un ..."
The Adventures of Sherlock Holmes ...,In his eyes she eclipses and predominates ...,"[In, his, eyes, she, eclipses, and, ..."


In [14]:
# sf_sentences holds one sentence per row, so we can now record which of the main
# characters are named in each sentence and later find the ones that appear together.

main_characters_set = set(["Irene","Mycroft","Lestrade","Sherlock","Moran","Moriarty","Watson"])
sf_sentences['characters'] = sf_sentences['words'].apply(
    lambda words: list(set(words) & main_characters_set))

In [16]:
import itertools
from collections import Counter
from graphlab import SGraph, Vertex, Edge

def _count_character_links(character_lists):
    """
    Count how often each unordered pair of characters appears together.
    :param character_lists: iterable of collections of character names (one per sentence).
    :return: Counter mapping (name_a, name_b) tuples, with name_a < name_b, to the
        number of collections in which both names occur.
    :rtype: collections.Counter
    """
    links = Counter()
    for characters in character_lists:
        # De-duplicate and sort first so that every pair has a single canonical key:
        # without this, ("Watson", "Sherlock") and ("Sherlock", "Watson") would be
        # counted as two different links.
        links.update(itertools.combinations(sorted(set(characters)), 2))
    return links


def get_characters_graph(sf, min_edge_stength=1):
    """
    Construct a social network from an input SFrame. In the social network the vertices are the characters
    and the edges are only between characters that appear in the same sentence at least min_edge_stength times.
    The input SFrame is not modified.
    :param sf: input SFrame object that contains a 'characters' column (a list of character names per row)
    :param min_edge_stength: minimal connection strength between two characters (the parameter name keeps
        its historical spelling so that existing keyword callers continue to work).
    :return: SGraph object constructed from the input SFrame. The graph only contains edges with
        at least the input minimal strength between the characters; each edge stores its count in the
        'strength' attribute. If no pair reaches the minimal strength an empty SGraph is returned.
    :rtype: gl.SGraph
    """
    # keep only the sentences that mention at least two characters; the mask is built
    # on the side (and applied to the `sf` argument) so the caller's SFrame gets no extra column
    has_link = sf['characters'].apply(lambda l: len(l)) > 1

    # calculating the connections strength between each two characters (order doesn't matter)
    links = _count_character_links(sf[has_link]['characters'])

    # filter out connections that appear less than min_edge_stength times
    edges_list = [Edge(src, dst, attr={'strength': strength})
                  for (src, dst), strength in links.items()
                  if strength >= min_edge_stength]

    g = SGraph()
    if edges_list:
        # only touch the graph when there is something to add
        g = g.add_edges(edges_list)
    return g

# Build the co-occurrence graph from the 'characters' column of sf_sentences (main
# characters at this point) and draw it, labelling vertices with their id and edges
# with their strength.
g = get_characters_graph(sf_sentences)
g.show(vlabel="__id", elabel="strength", node_size=200)

In [17]:
# Add minor characters to the graph 

# Despite its name, this set holds the main characters as well as the minor ones.
# "Moriarty" must be spelled exactly as it appears in the text, otherwise the
# character never matches any word and silently disappears from the graph.
minor_characters_set = set(["Irene","Mycroft","Lestrade","Sherlock","Moran","Moriarty","Watson","Baynes","Billy","Bradstreet","Gregson"
                           ,"Hopkins","Hudson","Shinwell","Athelney","Mary","Langdale","Toby","Wiggins"])

# Recompute which characters each sentence mentions using the extended set, and keep
# only the sentences in which at least two of them appear together.
sf_sentences['characters'] = sf_sentences['words'].apply(lambda w: list(set(w) & minor_characters_set))
sf_sentences['characters_num'] = sf_sentences['characters'].apply(lambda l: len(l))
sf_sentences = sf_sentences[sf_sentences['characters_num'] > 1]

# Rebuild and draw the graph, now including the minor characters.
g = get_characters_graph(sf_sentences)
g.show(vlabel="__id", elabel="strength", node_size=200)