In [1]:
import urllib

# Plain-text Project Gutenberg downloads, keyed by the label this notebook
# uses to look each book up.
book_urls = dict([
    ("Jane Austin", "http://www.gutenberg.org/cache/epub/1342/pg1342.txt"),
    ("Alice in Wonderland", "http://www.gutenberg.org/cache/epub/11/pg11.txt"),
    ("The Yellow Wallpaper", "http://www.gutenberg.org/cache/epub/1952/pg1952.txt"),
    ("Huckleberry Finn", "http://www.gutenberg.org/cache/epub/76/pg76.txt"),
    ("The Importance of Being Earnest", "http://www.gutenberg.org/cache/epub/844/pg844.txt"),
])

def load_gutenberg_book(url, char_limit=10000, min_len_of_sections=40,
                        n_header_sections=6):
    """
    Returns a list of paragraphs in the book.

    The downloaded text is split on blank lines (CRLF CRLF). Sections shorter
    than min_len_of_sections are dropped, line breaks inside a section are
    turned into spaces, and the first n_header_sections surviving sections
    are skipped (presumably the Project Gutenberg preamble -- verify per book).

    url: A url from Project Gutenberg.
    char_limit: Read at most this much of the book (counted as returned by
        the connection's read()); None reads the whole book.
    min_len_of_sections: Each paragraph must be at least this many characters long
        (measured before line breaks are collapsed).
    n_header_sections: Number of leading sections to skip.
    """
    # Function-scope import so the block works on Python 3 as well as Python 2.
    try:
        from urllib.request import urlopen
    except ImportError:
        from urllib import urlopen

    book = urlopen(url)
    try:
        # An explicit "no argument" read is the portable way to read everything.
        book_text = book.read() if char_limit is None else book.read(char_limit)
    finally:
        # Always release the connection, even if the read fails.
        book.close()

    # Python 3 hands back bytes; Python 2 already returns str and is left alone.
    if not isinstance(book_text, str):
        book_text = book_text.decode("utf-8", "replace")

    result = [section.replace("\r\n", " ").strip()
              for section in book_text.split("\r\n\r\n")
              if len(section) >= min_len_of_sections]

    # Slicing past the end simply yields an empty list, so no clamping is needed.
    return result[n_header_sections:]

In [29]:
# `ja` keeps the loader's default character limit, so it only covers the
# opening part of that book.
ja = load_gutenberg_book(url=book_urls["Jane Austin"])
# `aw` is the complete book: char_limit=None disables the limit.
aw = load_gutenberg_book(url=book_urls["Alice in Wonderland"], char_limit=None)

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [31]:
# Learn a TF-IDF vocabulary from the Alice paragraphs and encode each
# paragraph as one row of `awt`.
vect = TfidfVectorizer()
awt = vect.fit_transform(raw_documents=aw)

In [32]:
import numpy as np

In [33]:
def cosine_similarity(new_docs, old_docs):
    """
    Returns a similarity matrix where row i holds the similarities of the
    i-th new doc compared with each of the old docs.

    new_docs, old_docs: 2-D document-by-feature matrices (scipy sparse
        matrices or numpy arrays) with the same number of columns.

    The result is the plain dot product of every new row with every old row.
    It equals the cosine similarity only when the rows are already scaled to
    unit length; this function does not normalise them.
    """
    # .dot() is a matrix product for scipy sparse matrices and numpy arrays
    # alike, whereas `*` is element-wise for plain ndarrays.
    return new_docs.dot(old_docs.T)

def find_closest_matches(similarity_matrix, n_matches_to_return=1):
    """
    Returns, for every row, the column indices of its highest scores.

    Expects a dense array of the form [[1., .5, .2],
                                       [.3, 1., .1],
                                       [.2, .4, 1.]]
    where row i holds the similarities of item i to every column item.
    An np.matrix is accepted as well and is treated as a plain array.

    n_matches_to_return: How many indices to return per row. Values larger
        than the number of columns return every column; values of zero or
        less return no columns.

    The result is a 2-D integer ndarray with one row per input row, ordered
    from the most similar column to the least similar one.
    """
    # Work on a plain ndarray so matrix subclasses cannot change the slicing rules.
    scores = np.asarray(similarity_matrix)
    n_columns = scores.shape[1]
    n_kept = max(0, min(int(n_matches_to_return), n_columns))

    ascending = np.argsort(scores, axis=1)
    # Slice from an explicit start: a "[-n:]" slice would return everything for n == 0.
    return ascending[:, n_columns - n_kept:][:, ::-1]
    
# Every Alice paragraph scored against every other Alice paragraph,
# converted to a dense matrix for ranking.
similarities = cosine_similarity(new_docs=awt, old_docs=awt).todense()
# Two matches per paragraph; the later reporting cell treats the first of
# them as the paragraph itself.
matches = find_closest_matches(similarity_matrix=similarities, n_matches_to_return=2)

In [35]:
# Walk through the paragraphs in order and report each one whose best
# match (other than itself) beats the best score reported so far.
top_score = 0

for new_text, old_texts in enumerate(matches):
    # Remove the paragraph's own index explicitly instead of assuming it is
    # ranked first: duplicated paragraphs tie at the top, and a paragraph
    # with an all-zero vector may not rank itself at all.
    candidates = [ind for ind in np.asarray(old_texts).ravel() if ind != new_text]
    if not candidates:
        # Nothing to compare against (e.g. only one match was requested).
        continue

    scores = [float(similarities[new_text, ind]) for ind in candidates]
    max_score = max(scores)
    if top_score < max_score:
        top_score = max_score
        # Single-argument print(...) and %-formatting print the same text on
        # Python 2 and Python 3.
        print(max_score)
        similar_texts = [(score, aw[ind]) for score, ind in zip(scores, candidates)]
        print(aw[new_text])
        print(similar_texts)
        print("%s %s" % (new_text, old_texts))
        print("")

0.242165356148
So she was considering in her own mind (as well as she could, for the hot day made her feel very sleepy and stupid), whether the pleasure of making a daisy-chain would be worth the trouble of getting up and picking the daisies, when suddenly a White Rabbit with pink eyes ran close by her.
[(0.24216535614796228, "Alice thought she might as well go back, and see how the game was going on, as she heard the Queen's voice in the distance, screaming with passion. She had already heard her sentence three of the players to be executed for having missed their turns, and she did not like the look of things at all, as the game was in such confusion that she never knew whether it was her turn or not. So she went in search of her hedgehog.")]
0 [  0 425]

0.342408595348
There was nothing so VERY remarkable in that; nor did Alice think it so VERY much out of the way to hear the Rabbit say to itself, 'Oh dear! Oh dear! I shall be late!' (when she thought it over afterwards, it occurred