In [83]:
import urllib
import glob
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from nltk.stem.porter import PorterStemmer
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer 

def load_gutenberg_book(url, char_limit=10000, min_len_of_sections=40):
    """
    Returns a list of paragraphs in the book.

    The text is split into sections on blank lines ("\\r\\n\\r\\n"), sections
    shorter than `min_len_of_sections` are dropped, line breaks inside a
    section are replaced by spaces, and the first six remaining sections are
    skipped.

    url: A url from Project Gutenberg.
    char_limit: Amount of the book to read. At most this many bytes are
        fetched and at most this many characters are kept. None reads the
        whole book.
    min_len_of_sections: Each paragraph must be at least this many characters long.
    """
    # urlopen lives in urllib.request on Python 3; imported locally so the
    # function does not depend on which urllib submodules are already loaded.
    from urllib.request import urlopen

    # The context manager closes the connection even if reading fails.
    with urlopen(url) as book:
        raw_bytes = book.read() if char_limit is None else book.read(char_limit)

    # read() returns bytes; decode before doing any string handling. Bytes that
    # are not valid UTF-8 (including a character cut in half by the byte
    # limit) become U+FFFD instead of raising.
    book_text = raw_bytes.decode("utf-8", errors="replace")

    result = []
    for text in book_text[:char_limit].split("\r\n\r\n"):
        if len(text) >= min_len_of_sections:
            clean_text = text.replace("\r\n", " ").strip()
            result.append(clean_text)

    # The first six sections are dropped (presumably Gutenberg front matter --
    # TODO confirm). Slicing past the end simply yields an empty list.
    return result[6:]

def get_text(path):
    """
    Returns the contents of the text file at `path` as a string.

    The candidate encodings are tried in the order listed below and the first
    one that decodes the whole file without error is used.

    path: Location of the text file to read.

    Raises ValueError when none of the candidate encodings can decode the file.
    """
    encoding_options = "ascii utf-8 utf-16 utf-32 utf-16-be utf-16-le utf-32-be utf-32-le".split()

    for encoding in encoding_options:
        try:
            with open(path, encoding=encoding) as book:
                return book.read()
        # Catch UnicodeError, not just UnicodeDecodeError: the BOM-sniffing
        # "utf-16"/"utf-32" codecs raise the plain parent class when the file
        # has no byte-order mark, which would otherwise abort the loop before
        # the explicit -be/-le variants are tried.
        except UnicodeError:
            continue
    raise ValueError("Could not decode {!r} with any of: {}".format(
        path, ", ".join(encoding_options)))

def extract_term(term_indicator, text, default=None, max_term_size=75):
    """
    Returns the text that follows `term_indicator`, up to the end of that line.

    term_indicator: Marker that precedes the term, e.g. "Title:".
    text: Text to search; only the first occurrence of the marker is used.
    default: Value returned when no usable term is found.
    max_term_size: Terms longer than this many characters are rejected.

    `default` is returned when the marker is missing, when only whitespace
    follows it on the line, or when the term exceeds `max_term_size`.
    """
    indicator_start = text.find(term_indicator)
    if indicator_start == -1:
        return default

    # Search for the line break *after* the marker. Markers such as "\n\nby "
    # begin with a newline themselves, so searching from the marker's own
    # start would stop immediately and always produce an empty term.
    term_start = indicator_start + len(term_indicator)
    term_end = text.find("\n", term_start)
    if term_end == -1:
        # The term is on the last line; without this, -1 would be used as the
        # slice end and silently drop the final character.
        term_end = len(text)

    term = text[term_start:term_end].strip()
    if not term or len(term) > max_term_size:
        return default
    return term

def get_author_and_title(book_text):
    """
    Returns a (title, author) tuple extracted from the text of a book.

    The title comes from the first "Title:" line and the author from the
    first "Author:" line. When no author is found that way, a few looser
    by-line patterns are tried in turn against the first 15000 characters.
    Either element is None when nothing was found.
    """
    title = extract_term("Title:", book_text, default=None)
    author = extract_term("Author:", book_text, default=None)

    # Fallback patterns for books without an "Author:" line; the first one
    # that yields a value (anything other than None) ends the search.
    fallback_indicators = ("\n\nby ", "\n\nOF ", "\nOF\n")
    opening = book_text[:15000]
    for indicator in fallback_indicators:
        if author is not None:
            break
        author = extract_term(indicator, opening, max_term_size=25)

    return title, author

def locate_beginning_of_text(title, author, text):
    """
    Returns the index in `text` from which the body of the book is read.

    title: Title of the book, or None/empty if unknown.
    author: Author of the book, or None/empty if unknown.
    text: Full text of the book.

    When the "START OF THIS PROJECT GUTENBERG" marker is present, the result
    is its position plus 25. Otherwise the position of the author (if given
    and present in the text) is used, then that of the title, and finally 0
    (the very start of the text) when nothing can be located.
    """
    marker_position = text.find("START OF THIS PROJECT GUTENBERG")
    # Test for "not found" before adding the offset; adding first turns -1
    # into 24 and makes the fallback below unreachable.
    if marker_position != -1:
        # NOTE(review): 25 is shorter than the 31-character marker, so the
        # returned index lies inside the marker line. Kept unchanged so the
        # result for books that contain the marker stays the same.
        return marker_position + 25

    # The author takes precedence over the title, but only when it is
    # actually found, so a miss can no longer overwrite a valid title hit.
    for fallback in (author, title):
        if fallback:
            position = text.find(fallback)
            if position != -1:
                return position

    # Nothing usable: start at the beginning rather than returning -1, which
    # a caller slicing the text would interpret as "last character only".
    return 0

def locate_end_of_text(text):
    """
    Returns the index in `text` of the closing Project Gutenberg marker.

    Several spellings of the marker are searched for; when more than one is
    present, the one located furthest into the text is used. If none is
    present, "Fail" is printed and None is returned.
    """
    end_markers = (
        "End of Project Gutenberg",
        "END OF THIS PROJECT GUTENBERG EBOOK",
        "END OF THE PROJECT GUTENBERG EBOOK",
        "End of the Project Gutenberg Etext",
    )

    # str.find reports -1 for a miss, so -1 doubles as "nothing found yet".
    furthest = -1
    for marker in end_markers:
        furthest = max(furthest, text.find(marker))

    if furthest >= 0:
        return furthest

    print("Fail")
    return None

def parse_book(book_text, min_paragraph_characters=100):
    """
    Given the text of a book, returns a list of dictionaries, one per kept
    paragraph, with the keys:
    {title, author, contents, part}

    book_text: Full text of the book.
    min_paragraph_characters: Paragraphs shorter than this (after line breaks
        are replaced by spaces and the ends are stripped) are skipped.

    Paragraphs are the pieces between blank lines ("\\n\\n") of the text that
    lies between the located beginning and end of the book. Paragraphs that
    mention "gutenberg" (in any letter case) are skipped as well. `part` is
    the paragraph's position among all pieces, skipped ones included.
    """
    title, author = get_author_and_title(book_text)
    begin = locate_beginning_of_text(title, author, book_text)
    end = locate_end_of_text(book_text)
    body = book_text[begin:end]

    records = []
    for part, chunk in enumerate(body.split("\n\n")):
        contents = chunk.replace("\n", " ").strip()
        long_enough = len(contents) >= min_paragraph_characters
        if long_enough and "gutenberg" not in contents.lower():
            records.append({"title": title,
                            "author": author,
                            "contents": contents,
                            "part": part})
    return records

def get_list_of_book_paths(book_directory):
    """
    Returns the paths of the "*.txt" files directly inside `book_directory`,
    sorted alphabetically.

    book_directory: Directory to search (sub-directories are not searched).

    The list is sorted because glob yields files in an arbitrary,
    platform-dependent order; anything derived from the order of the books
    would otherwise differ between runs and machines.
    """
    return sorted(glob.glob(book_directory + '/*.txt'))

def books_to_pandas(book_directory, min_paragraph_characters=100):
    """
    Returns a DataFrame with one row per paragraph of every book found in
    `book_directory`.

    book_directory: Directory containing the books as "*.txt" files.
    min_paragraph_characters: Passed on to parse_book; shorter paragraphs
        are left out.
    """
    rows = []
    for path in get_list_of_book_paths(book_directory):
        # Each book contributes a list of paragraph dictionaries.
        rows += parse_book(get_text(path), min_paragraph_characters)

    # Build the frame once from all records rather than growing it per book.
    return pd.DataFrame(rows)

class LemmaTokenizer(object):
    """
    Callable that turns a document into a list of lemmatized tokens: the
    document is split with word_tokenize and every token is passed through
    a WordNetLemmatizer.
    """

    def __init__(self):
        # A single lemmatizer instance is created here and reused on every call.
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        """Returns the lemmatized tokens of `doc`, in their original order."""
        lemmatize = self.wnl.lemmatize
        return [lemmatize(token) for token in word_tokenize(doc)]

def cosine_similarity(new_docs, old_docs):
    """
    Returns a similarity matrix where the first row is an array of
    similarities of the first new_doc compared with each of the old
    docs.

    new_docs: Matrix (scipy sparse or numpy) with one row vector per new doc.
    old_docs: Matrix of the same kind with one row vector per old doc.

    The result is the matrix product of new_docs with the transpose of
    old_docs, i.e. the dot product of every pair of rows. No normalisation is
    done here, so the values are cosine similarities only if every row
    already has unit (L2) length.
    """
    # .dot is a matrix product for sparse matrices, np.matrix and ndarrays
    # alike, whereas "*" multiplies ndarrays element by element.
    return new_docs.dot(old_docs.T)

def find_closest_matches(similarity_matrix, n_matches_to_return=1):
    """
    Expects a dense array of the form [[1., .5, .2],
                                       [.3, 1., .1],
                                       [.2, .4, 1.]]
    where rows correspond to similarities.

    similarity_matrix: 2-D array-like of similarities (an np.matrix is
        accepted too).
    n_matches_to_return: Number of column indices to return per row.

    Returns a 2-D integer ndarray with one row per input row, holding the
    column indices of that row's largest values, best match first.
    """
    # Coerce to a plain ndarray first: slicing an np.matrix (what
    # scipy's .todense() returns) keeps it two-dimensional, so row-wise
    # slicing would not do what it does for an ndarray.
    dense = np.asarray(similarity_matrix)

    # Sort every row at once, keep the last (largest) n columns and flip
    # them so the best match comes first.
    ascending = dense.argsort(axis=1)
    top_indices = ascending[:, -n_matches_to_return:][:, ::-1]
    return top_indices

In [51]:
# Parse every *.txt file in the "popular_books" directory into a single
# DataFrame with one row per paragraph.
books = books_to_pandas("popular_books")

In [57]:
# Preview the first few parsed paragraphs.
books.head()

Unnamed: 0,author,contents,part,title
0,"Alexandre Dumas, Pere","On the 24th of February, 1815, the look-out at...",6,THE COUNT OF MONTE CRISTO
1,"Alexandre Dumas, Pere","As usual, a pilot put off immediately, and rou...",7,THE COUNT OF MONTE CRISTO
2,"Alexandre Dumas, Pere","Immediately, and according to custom, the ramp...",8,THE COUNT OF MONTE CRISTO
3,"Alexandre Dumas, Pere",The ship drew on and had safely passed the str...,9,THE COUNT OF MONTE CRISTO
4,"Alexandre Dumas, Pere",The vague disquietude which prevailed among th...,10,THE COUNT OF MONTE CRISTO


In [91]:
# Build the TF-IDF matrix for the first 20000 paragraphs, tokenising each
# paragraph with LemmaTokenizer.
vectorizer = TfidfVectorizer(max_df=.7, min_df=100, tokenizer=LemmaTokenizer())
vect = vectorizer.fit_transform(books[:20000].contents)

# Compare every paragraph with every other one. toarray() is used instead of
# todense() so the result is a plain ndarray (the "dense array" that
# find_closest_matches documents) rather than an np.matrix.
# NOTE: the dense result has one row and one column per paragraph (up to
# 20000 x 20000 values), which needs a lot of memory.
similarities = cosine_similarity(vect, vect).toarray()

# For every paragraph keep the indices of its two highest similarities.
matches = find_closest_matches(similarities, 2)

In [92]:
# Walk through the paragraphs in order and report one whenever its best
# match beats the best score seen so far.
top_score = 0

for new_text, old_texts in enumerate(matches):
    # The first (highest) match is skipped -- presumably it is the paragraph
    # itself; TODO confirm this also holds for duplicated or empty paragraphs.
    candidates = old_texts[1:]
    # Plain [row, column] indexing returns a scalar for ndarray and np.matrix.
    scores = [float(similarities[new_text, ind]) for ind in candidates]
    max_score = max(scores)
    if top_score < max_score:
        top_score = max_score
        print(max_score)
        # .ix was removed from pandas. Rows of the similarity matrix follow
        # the order of books[:20000], so positional lookup (.iloc) is used.
        similar_texts = [(score, books.contents.iloc[ind])
                         for score, ind in zip(scores, candidates)]
        print(books.contents.iloc[new_text])
        print(similar_texts)
        print(new_text, old_texts)
        print()

0.7872149245626546
On the 24th of February, 1815, the look-out at Notre-Dame de la Garde signalled the three-master, the Pharaon from Smyrna, Trieste, and Naples.
[(0.7872149245626546, '"It is the social capital of a theatre on the boulevard, or a railroad from the Jardin des Plantes to La Rapee."')]
0 [   0 2051]

0.9111984962475744
"'The king's attorney is informed by a friend to the throne and the religions institutions of his country, that one named Edmond Dantes, mate of the ship Pharaon, this day arrived from Smyrna, after having touched at Naples and Porto-Ferrajo, has been the bearer of a letter from Murat to the usurper, and again taken charge of another letter from the usurper to the Bonapartist club in Paris. Ample corroboration of this statement may be obtained by arresting the above-mentioned Edmond Dantes, who either carries the letter for Paris about with him, or has it at his father's abode. Should it not be found in the possession of father or son, then it will assured