In [98]:
# %%file get_books.sh
#!/usr/bin/bash
# Download plain-text books from the ibiblio Project Gutenberg FTP mirror.
# wget flags used below:
#   -nd      do not recreate the remote directory tree; save every file flat
#   -r       recurse into subdirectories
#   -l 10    limit the recursion depth to 10 levels
#   -A.txt   accept (keep) only files whose names end in .txt
# NOTE(review): the "%%file" magic on the first line is commented out, so
# executing this cell runs wget directly instead of writing get_books.sh.
!wget -nd -r -l 10 -A.txt ftp://ftp.ibiblio.org/pub/docs/books/gutenberg/

Writing get_books.sh


In [31]:
# %%file books.py
#!/usr/bin/env python

import glob
from hashlib import md5
import requests

def is_not_digit(path):
    """Return True when the file's base name is not purely numeric.

    The base name is the part of ``path`` after the last "/" and before the
    first "." of that final component (so "books/123.txt" -> "123").
    """
    file_name = path.rsplit("/", 1)[-1]
    stem = file_name.partition(".")[0]
    if stem.isdigit():
        return False
    return True

def not_good_file(path):
    """Return True for paths that should be skipped when loading books.

    A path is rejected when it contains a "-" anywhere (directory part
    included) or when ``is_not_digit`` reports a non-numeric base name.
    """
    if "-" in path:
        return True
    return is_not_digit(path)

def md5_hash(k):
    """Return the hexadecimal MD5 digest of the string ``k``.

    The string is converted to bytes with ``str.encode()`` (default codec)
    before hashing.
    """
    digest = md5()
    digest.update(k.encode())
    return digest.hexdigest()

def get_text(path, encoding=None, errors=None):
    """Read and return the entire contents of the text file at ``path``.

    Parameters
    ----------
    path : str
        Location of the file to read.
    encoding : str, optional
        Text encoding passed to ``open``. ``None`` (the default) keeps the
        previous behaviour of using the platform's default encoding.
    errors : str, optional
        Decoding error policy passed to ``open`` (e.g. ``"replace"``).
        ``None`` keeps the default strict handling.

    Returns
    -------
    str
        The decoded file contents.

    Raises
    ------
    OSError
        If the file cannot be opened.
    UnicodeDecodeError
        If the bytes cannot be decoded under the chosen encoding/policy.
    """
    # The optional arguments let callers read files that are not in the
    # platform default encoding without changing existing call sites.
    with open(path, encoding=encoding, errors=errors) as book:
        return book.read()

def extract_term(term_indicator, text, default=None, max_term_size=75):
    """Extract the rest of the line that follows ``term_indicator`` in ``text``.

    Parameters
    ----------
    term_indicator : str
        Marker that precedes the wanted value (e.g. ``"Title:"``).
    text : str
        Text to search; only the first occurrence of the marker is used.
    default : object, optional
        Value returned when no usable term is found.
    max_term_size : int, optional
        Longest accepted term, in characters, after stripping whitespace.

    Returns
    -------
    str or ``default``
        The stripped text between the end of the marker and the next
        newline (or the end of ``text``). ``default`` is returned when the
        marker is missing, the term is empty, or the term is longer than
        ``max_term_size``.
    """
    term_start = text.find(term_indicator)
    if term_start == -1:
        return default

    value_start = term_start + len(term_indicator)
    # Look for the line end *after* the marker: markers such as "\n\nby "
    # begin with a newline, and searching from the marker's start would
    # match that newline and always produce an empty term.
    term_end = text.find("\n", value_start)
    if term_end == -1:
        # The marker sits on the last line; take everything up to the end
        # rather than letting -1 chop off the final character.
        term_end = len(text)

    term = text[value_start:term_end].strip()
    if not term or len(term) > max_term_size:
        return default
    return term

def get_author_and_title(book_text):
    """Return a ``(title, author)`` tuple extracted from ``book_text``.

    The title comes from the "Title:" line and the author from the
    "Author:" line. When the author lookup yields ``None``, a few alternative
    markers are tried in order against the first 15000 characters, accepting
    at most 25 characters; the first marker whose result is not ``None``
    wins. Either element may be ``None``.
    """
    title = extract_term("Title:", book_text, default=None)
    author = extract_term("Author:", book_text, default=None)

    if author is None:
        # Fallbacks for books whose author is not given on an "Author:" line.
        for marker in ("\n\nby ", "\n\nOF ", "\nOF\n"):
            author = extract_term(marker, book_text[:15000], max_term_size=25)
            if author is not None:
                break

    return title, author

def locate_beginning_of_text(title, author, text):
    """Return the index in ``text`` from which the book should be read.

    Parameters
    ----------
    title : str or None
        Extracted title, if any.
    author : str or None
        Extracted author, if any.
    text : str
        Full text of the book.

    Returns
    -------
    int
        Position of the first occurrence of ``author`` when it is given and
        present in ``text``; otherwise the position of ``title`` under the
        same conditions; otherwise 0 (start of the text).
    """
    # Start from 0 so the function always returns a usable slice index,
    # even when neither term is supplied or neither occurs in the text.
    location = 0
    # Author is checked last so that, as before, it takes precedence over
    # the title when both are found.
    for term in (title, author):
        if not term:
            continue
        found = text.find(term)
        if found != -1:
            location = found
    return location

def parse_book(book_text):
    """
    Split a book into paragraph records.

    Returns a list of dictionaries, one per kept paragraph, each with the
    keys {title, author, contents, part, hash}. The list is empty when
    neither a title nor an author could be extracted.
    """
    title, author = get_author_and_title(book_text)
    if not (title or author):
        return []

    # Everything before the detected start position is discarded.
    start = locate_beginning_of_text(title, author, book_text)

    records = []
    # "part" is the index among all blank-line-separated chunks, so skipped
    # short chunks still consume a number.
    for part, chunk in enumerate(book_text[start:].split("\n\n")):
        if len(chunk) >= 30:  # chunks shorter than 30 characters are skipped
            contents = chunk.replace("\n", " ").strip()
            records.append({"title": title,
                            "author": author,
                            "contents": contents,
                            "part": part,
                            "hash": md5_hash(contents)})
    return records

def save_all_paragraphs_to_elasticsearch(paragraphs, _index="test", _type="test"):
    """Post every item of ``paragraphs`` to Elasticsearch, one request each.

    Progress is printed sparsely: the current position is shown whenever it
    exceeds twice the last printed position (starting from 125), so the gaps
    between printed numbers keep doubling.
    """
    last_reported = 125
    position = 0
    for paragraph in paragraphs:
        if position > 2 * last_reported:
            print(position)
            last_reported = position
        post_to_elasticsearch(_index=_index, _type=_type, payload=paragraph)
        position += 1
       
    
def post_to_elasticsearch(endpoint='http://ec2-52-90-47-54.compute-1.amazonaws.com:9200/', 
                          _index=None, _type=None, payload=None):
    """POST ``payload`` as JSON to ``endpoint`` and return the response.

    ``_index`` (followed by "/") and ``_type`` are appended to the endpoint
    when given. The original note on this function states that it only adds
    new entries and does not update existing ones.
    """
    url = endpoint
    if _index:
        url = url + _index + "/"
    if _type:
        url = url + _type
    request_headers = {'X-Forwarded-For': '100.100.0.144'}
    return requests.post(url, json=payload, headers=request_headers)

# r = post_to_elasticsearch(_index="actor", _type="Cruise", payload={"age":43, "best_movie":"Top Gun"})

def get_from_elasticsearch(endpoint='http://ec2-52-90-47-54.compute-1.amazonaws.com:9200/', 
                           _index=None, _type=None, query=None):
    """Run a URI search against Elasticsearch and return the response body.

    Parameters
    ----------
    endpoint : str
        Base URL of the server, ending in "/".
    _index : str, optional
        Index to search; appended to the URL when given.
    _type : str, optional
        Document type; only used when ``_index`` is also given.
    query : str, optional
        Value for the ``q`` parameter (e.g. ``"cat"`` or ``"part:666"``).
        When ``None`` the ``q`` parameter is omitted entirely.

    Returns
    -------
    str
        The text of the HTTP response (pretty-printed JSON is requested).

    The final URL is printed before the request is sent.
    """
    # Imported here so the block does not depend on the file's import cell.
    from urllib.parse import quote

    if _index:
        endpoint += _index + "/"
        if _type:
            endpoint += _type + "/"
    endpoint += "_search?pretty=true"
    if query is not None:
        # Percent-encode the query so characters such as "&", "#" or spaces
        # cannot break the URL; ":" stays literal to keep "field:value"
        # queries readable in the printed URL.
        endpoint += "&q=" + quote(query, safe=":")
    print("URL:", endpoint)
    return requests.get(endpoint).text

# r = get_from_elasticsearch(_index="actor", _type="Cruise", query="best_movie:Top")

In [32]:
%%time
# Free-text search for "cat" in the "test" index/type.
# The response recorded below reports 43 matching paragraphs ("hits.total").
print(get_from_elasticsearch(_index="test", _type="test", query="cat"))

URL: http://ec2-52-90-47-54.compute-1.amazonaws.com:9200/test/test/_search?pretty=true&q=cat
{
  "took" : 2,
  "timed_out" : false,
  "_shards" : {
    "total" : 5,
    "successful" : 5,
    "failed" : 0
  },
  "hits" : {
    "total" : 43,
    "max_score" : 1.3889651,
    "hits" : [ {
      "_index" : "test",
      "_type" : "test",
      "_id" : "AVQO-lCAIJj__yzfX1-K",
      "_score" : 1.3889651,
      "_source":{"part": 221, "contents": "_One of the most striking differences between a cat and a      lie is that a cat has only nine lives._ --Pudd'nhead      Wilson's Calendar", "hash": "2f6f7e53f386cc16caa6c2dcc4e22fd4", "author": "Mark Twain (Samuel Clemens)", "title": "The Tragedy of Pudd'nhead Wilson"}
    }, {
      "_index" : "test",
      "_type" : "test",
      "_id" : "AVQO-dEwIJj__yzfX179",
      "_score" : 0.9282705,
      "_source":{"part": 70, "contents": "\"_You_ is, you black mud cat! Yah--yah--yah! I got somep'n' better to do den 'sociat'n' wid niggers as black as you is

In [33]:
%%time
# Free-text search for "dog" in the "test" index/type.
# The recorded response reports 160 matching paragraphs, so in this corpus
# dogs are mentioned more often than cats (43).
print(get_from_elasticsearch(_index="test", _type="test", query="dog"))

URL: http://ec2-52-90-47-54.compute-1.amazonaws.com:9200/test/test/_search?pretty=true&q=dog
{
  "took" : 1,
  "timed_out" : false,
  "_shards" : {
    "total" : 5,
    "successful" : 5,
    "failed" : 0
  },
  "hits" : {
    "total" : 160,
    "max_score" : 1.2527326,
    "hits" : [ {
      "_index" : "test",
      "_type" : "test",
      "_id" : "AVQO_4DwIJj__yzfX2Uc",
      "_score" : 1.2527326,
      "_source":{"part": 739, "contents": "\"You're a sly dog!\" said Passepartout, winking at him.", "hash": "0f39c97b89a0d2ed2edb22f3c1cdd86b", "author": "Jules Verne", "title": "Around the World in 80 Days"}
    }, {
      "_index" : "test",
      "_type" : "test",
      "_id" : "AVQO-bsqIJj__yzfX17k",
      "_score" : 1.2184082,
      "_source":{"part": 38, "contents": "\"I wish I owned half of that dog.\"", "hash": "e6824ce5d3141022a97ce968e391e658", "author": "Mark Twain (Samuel Clemens)", "title": "The Tragedy of Pudd'nhead Wilson"}
    }, {
      "_index" : "test",
      "_type" : "t

In [34]:
%%time
# Oooooohhh... scary. Fetch the paragraphs whose "part" field is 666.
# "part" is the index assigned by parse_book: zero-based, counted from the
# detected start of the text, and skipped short paragraphs still use up a
# number - so this is not exactly the 666th kept paragraph of each book.
print(get_from_elasticsearch(_index="test", _type="test", query="part:666"))

URL: http://ec2-52-90-47-54.compute-1.amazonaws.com:9200/test/test/_search?pretty=true&q=part:666
{
  "took" : 2,
  "timed_out" : false,
  "_shards" : {
    "total" : 5,
    "successful" : 5,
    "failed" : 0
  },
  "hits" : {
    "total" : 7,
    "max_score" : 1.0,
    "hits" : [ {
      "_index" : "test",
      "_type" : "test",
      "_id" : "AVQOyKZQIJj__yzfXy2D",
      "_score" : 1.0,
      "_source":{"part": 666, "contents": "Enter the guard, rushing in", "hash": "1cd37ee24e0b8ac5bf5e3caede28fb28", "author": "William Shakespeare", "title": "The Complete Works of William Shakespeare"}
    }, {
      "_index" : "test",
      "_type" : "test",
      "_id" : "AVQO4C96IJj__yzfX0S_",
      "_score" : 1.0,
      "_source":{"part": 666, "contents": "As Alexander, in those torrid parts   Of India, beheld upon his host   Flames fall unbroken till they reached the ground.", "hash": "db83586cf005e293a84349d5a9992b5e", "author": "Dante Alighieri", "title": "Divine Comedy, Longfellow's Transla

In [35]:
%%time
# Return the paragraphs whose "contents" field matches the word "password".
print(get_from_elasticsearch(_index="test", _type="test", query="contents:password"))

URL: http://ec2-52-90-47-54.compute-1.amazonaws.com:9200/test/test/_search?pretty=true&q=contents:password
{
  "took" : 2,
  "timed_out" : false,
  "_shards" : {
    "total" : 5,
    "successful" : 5,
    "failed" : 0
  },
  "hits" : {
    "total" : 8,
    "max_score" : 1.4881856,
    "hits" : [ {
      "_index" : "test",
      "_type" : "test",
      "_id" : "AVQO9PqOIJj__yzfX1nK",
      "_score" : 1.4881856,
      "_source":{"part": 194, "contents": "He heard a man's voice calling to him, and distinguished the dim shape of a rider close by. He shouted that password of the range, \"Hello!\"", "hash": "adcee0d80ad978cc0561fbd7db04d3a1", "author": "by (AKA B. M. Sinclair) B. M. Bower", "title": "The Lure of the Dim Trails"}
    }, {
      "_index" : "test",
      "_type" : "test",
      "_id" : "AVQO7LIcIJj__yzfX1D4",
      "_score" : 1.3641403,
      "_source":{"part": 946, "contents": "Section 1029 is remarkably elastic.  Suppose you find a computer password in somebody's trash.  That

In [36]:
%%time
# Search on the "title" field for "moon". Every stored document is one
# paragraph, so this returns paragraphs from books whose title matches
# (1249 hits in the recorded response), not a list of distinct titles.
print(get_from_elasticsearch(_index="test", _type="test", query="title:moon"))

URL: http://ec2-52-90-47-54.compute-1.amazonaws.com:9200/test/test/_search?pretty=true&q=title:moon
{
  "took" : 1,
  "timed_out" : false,
  "_shards" : {
    "total" : 5,
    "successful" : 5,
    "failed" : 0
  },
  "hits" : {
    "total" : 1249,
    "max_score" : 1.3635688,
    "hits" : [ {
      "_index" : "test",
      "_type" : "test",
      "_id" : "AVQO7-A4IJj__yzfX1RK",
      "_score" : 1.3635688,
      "_source":{"part": 31, "contents": "\"Not in the least,\" said I, placing myself beside him.", "hash": "e11f301b41acd72b0a4acc0ea2782278", "author": "H. G. Wells", "title": "The First Men In The Moon"}
    }, {
      "_index" : "test",
      "_type" : "test",
      "_id" : "AVQO7-EsIJj__yzfX1RL",
      "_score" : 1.3635688,
      "_source":{"part": 32, "contents": "\"My habits are regular. My time for intercourse--limited.\"", "hash": "539a4eed872cadfaeca7684fef7538e4", "author": "H. G. Wells", "title": "The First Men In The Moon"}
    }, {
      "_index" : "test",
      "_type

In [None]:
# Directory that is scanned for downloaded "*.txt" book files.
BOOK_DIRECTORY = "books"

if __name__ == "__main__":
    # Paragraph records collected from every accepted book.
    paragraphs = []

    for filename in glob.glob(BOOK_DIRECTORY + '/*.txt'):
        # Normalise Windows separators so the path helpers can split on "/".
        path = filename.replace("\\", "/")

        # Only files that pass the name filter are treated as books.
        if not not_good_file(path):
            book_text = get_text(path)
            parsed_book = parse_book(book_text)
            paragraphs.extend(parsed_book)

    # Uploading is switched off by default; set to True to post the
    # collected paragraphs to Elasticsearch.
    save_to_elasticsearch = False

    if save_to_elasticsearch:
        save_all_paragraphs_to_elasticsearch(paragraphs, _index="test", _type="test")