In [27]:
from __future__ import division # means division can return decimals
import os
import pickle

In [28]:
def remove_all_punctuation_and_numbers(word):
    '''
        Return word with every non-alphabetic character dropped
        (digits, punctuation and whitespace are all removed)
    '''
    letters_only = []
    for character in word:
        if character.isalpha():
            letters_only.append(character)
    return "".join(letters_only)

In [29]:
# Read all of 'stop-word-list.csv'.
# Values are comma separated, so split on ','.
# Strip the whitespace on both sides of each word: lstrip alone would be
# marginally cheaper, but that is not worth the risk of a stray space
# between a word and the following comma.
with open("stop-word-list.csv") as stopwords_file:
    # Call read() on the open handle directly; the `file` builtin only exists in Python 2
    stopwords = [word.strip() for word in stopwords_file.read().split(",")]

In [30]:
from collections import namedtuple # much more efficient than objects
from collections import defaultdict # a dict where you can set a default value
from collections import Counter # creates automatically key:frequency dictionary
import math

class VectorSpaceModel(object):
    """
        A tf-idf vector space model that ranks documents against a free-text
        query by cosine similarity.

        Typical use:
            1. create_document(title, contents) for every document
            2. tf_idf() once, after all documents have been added
            3. query(string) as often as needed
    """

    Document = namedtuple('Document', ['title', 'term_frequencies'])

    def __init__(self, stopwords=None, **kwargs):
        """
            stopwords: collection of words ignored when indexing and querying
                       (defaults to no stopwords)

            Optional keyword arguments:
                punctuation:   callable(word) -> cleaned word
                               (defaults to remove_all_punctuation_and_numbers)
                stemming_dict: mapping of word -> stem
                               (defaults to the pickled dictionary in 'snowball.p')
        """
        # Defaults are only resolved when the caller did not supply a value, so
        # 'snowball.p' is not read (and need not exist) when stemming_dict is given.
        if 'punctuation' in kwargs:
            self.remove_punctuation = kwargs['punctuation']
        else:
            self.remove_punctuation = remove_all_punctuation_and_numbers

        # A fresh list per instance rather than a shared mutable default
        self.stopwords = [] if stopwords is None else stopwords

        # dictionary to store how many documents a word was in
        self.inverse_document_counts = defaultdict(lambda: 0)

        if 'stemming_dict' in kwargs:
            self.stemming_dict = kwargs['stemming_dict']
        else:
            # downloaded the snowball stemming dictionary
            # created a python dictionary and pickled it
            # NOTE: unpickling can execute arbitrary code - only use a trusted 'snowball.p'
            with open("snowball.p", "rb") as stemming_file:
                self.stemming_dict = pickle.load(stemming_file)

        self.documents = [] # Holds named tuples
        self.number_of_documents = 0
        self.idf_squared = {} # key = document frequency : value = idf**2 to speed up calculations
        self.document_lengths = {} # key = document title : value = tf-idf vector length (filled by tf_idf)


    def stem(self, word):
        """
            Strip whitespace, lower-case the word and clean it with remove_punctuation.
            If the result is a stopword return None,
            otherwise return its stem from the stemming dictionary
            (or the cleaned word itself when it has no entry).
        """
        word = self.remove_punctuation(word.strip().lower())
        if word in self.stopwords:
            return None # this is a stopword so we ignore it
        return self.stemming_dict.get(word, word) # find a stem else return the word as is


    def _term_frequencies(self, text):
        """
            Split text on whitespace, stem every word and return
            {term: count / count of the most frequent term}.
            Returns an empty dict when nothing survives stemming
            (empty text, or text made only of stopwords / non-letters).
        """
        stems = (self.stem(word) for word in text.split())
        # stem returns None for stopwords (and '' for words without letters): drop both
        counter = Counter(term for term in stems if term)
        if not counter:
            return {}
        # float() keeps this a true division even without `from __future__ import division`
        highest_count = float(max(counter.values()))
        return {term: count / highest_count for term, count in counter.items()}


    def create_document(self, title, contents):
        """
            Index one document.
            Its term frequencies (how many times a term appears divided by the count of
            the most frequent term in the document) are stored with the stripped title
            as a Document named tuple.
            inverse_document_counts holds how many documents a term appears in. A document
            has no knowledge of the collection it is in, so it can only increment the count.

            A document with no indexable terms is still stored, with empty term frequencies.
            Call tf_idf() again after adding documents and before querying.
        """
        title = title.strip()
        document_term_frequencies = self._term_frequencies(contents)

        for kw in document_term_frequencies:
            self.inverse_document_counts[kw] += 1 # one more document contains kw

        doc_tuple = VectorSpaceModel.Document(title = title,  term_frequencies = document_term_frequencies)

        self.documents.append( doc_tuple )


    def tf_idf(self):
        """
            Pre-compute everything query() needs:
            idf**2 for every document count in use, and the vector length of every document.
            Note that lengths are keyed by title, so duplicate titles share one entry.
        """
        set_of_document_counts = set(self.inverse_document_counts.values())
        self.number_of_documents = len(self.documents)

        # cache the values of idf^2 for a given document frequency
        # assumes document idf = query idf
        self.idf_squared = {count : (self.inverse_document_frequency_method(count)**2) for count in set_of_document_counts}
        self.idf_squared.update({0:0}) # terms that appear in no document carry no weight
        # Get the lengths of vectors
        self.document_lengths = {document.title: self.get_vector_length(document) for document in self.documents}


    def get_vector_length(self,document):

        """
            Vector length is math.sqrt(sum(tf_idf**2))
            tf_idf**2 = tf*idf*tf*idf = tf**2 * idf**2

            idf_squared already calculated (by tf_idf)
            tf is frequency divided by most frequent word

        """
        idf_squared = self.idf_squared
        document_counts = self.inverse_document_counts

        # .get avoids inserting unseen (query) terms into the defaultdict
        length_squared = sum(idf_squared[document_counts.get(kw, 0)] * (tf**2)
                             for kw, tf in document.term_frequencies.items())

        return math.sqrt(length_squared)

    def query(self, string, limit = 10):
        """
            Rank the indexed documents against a free-text query.

            Returns a list of (title, cosine similarity) tuples, best match first,
            holding at most `limit` entries (None returns every document).
            The similarity is 0.0 whenever the query or the document has a zero-length
            vector, e.g. a query made only of stopwords or of terms no document contains.
            tf_idf() must have been called after the last document was added.
        """
        query_tf_dict = self._term_frequencies(string)

        # get_vector_length expects a document so turn query into document
        query = self.Document(title="query",  term_frequencies=query_tf_dict)
        query_length = self.get_vector_length(query)

        # Dot product:
        #
        # sum over the query terms of
        #     (query tf * term idf) * (document tf * term idf)
        #     = query tf * document tf * term idf squared

        document_scores = []
        titles = [document.title for document in self.documents]

        for document in self.documents:

            dot_product = sum(document.term_frequencies.get(key, 0) * query_tf *
                              self.idf_squared[self.inverse_document_counts.get(key, 0)]
                              for key, query_tf in query_tf_dict.items())

            denominator = self.document_lengths[document.title] * query_length

            # a zero-length vector has no direction: score it 0.0 instead of dividing by zero
            cos = dot_product/denominator if denominator else 0.0

            document_scores.append(cos)

        return sorted(zip(titles, document_scores), key = lambda x: x[1], reverse = True)[:limit]


    def inverse_document_frequency_method(self, doc_frequency):
        """
            idf = log10(number of documents / number of documents containing the term)
            Returns 0 for a term that is in no document.
        """
        if doc_frequency:
            return math.log10(self.number_of_documents/float(doc_frequency))
        else: return 0 # to avoid division by zero

In [31]:
# Build the model with the stop words read from 'stop-word-list.csv'.
# No stemming_dict is passed, so the constructor loads 'snowball.p' from the working directory.
VSM = VectorSpaceModel(stopwords)

In [32]:
import bs4
from bs4 import BeautifulSoup
import re

# 'vpl.p' is a pickled collection of paths to the HTML pages to index.
# Pickle files must be opened in binary mode, and the handle is closed by the with block.
# NOTE: unpickling can execute arbitrary code - only use a trusted 'vpl.p'
with open('vpl.p', 'rb') as page_list_file:
    pages = pickle.load(page_list_file)

for page in pages:
    with open(page) as f:
        source = f.read()

    # Name the parser explicitly so every machine parses the pages the same way,
    # regardless of which optional parsers (lxml, html5lib) happen to be installed
    soup = BeautifulSoup(source, "html.parser")

    # http://stackoverflow.com/questions/5598524/can-i-remove-script-tags-with-beautifulsoup
    for script in soup('script'):
        script.extract()

    # Removing comments appears to be broken in Beautiful Soup so use regular expression instead
    # http://stackoverflow.com/questions/28208186/how-remove-html-comments-using-regex-in-python
    # Can't figure out how to remove comments within comments
    # Could use re.subn which returns a tuple (string, number_of_replacements_made)
    # until number_of_replacements_made = 0
    # re.DOTALL (not re.MULTILINE) is what lets '.' cross newlines, so that
    # comments spanning several lines are removed as well
    page_text = re.sub("(<!--.*?-->)", "", soup.get_text(), flags=re.DOTALL)

    VSM.create_document(page, page_text)

In [33]:
# Cache the idf**2 values and the vector length of every document; VSM.query reads both,
# so this has to run after the last document is added and before the first query.
VSM.tf_idf()

In [36]:
queries = [
    "Curiouser said Alice",
    "late white rabbit",
    "Snark hunting is fun",
    "Vorpal blade",
    "Snicker Carpenter Snack Oyster",
    "Professor Gardener Waggly Revenge",
]

# Results are written to one text file per query, named after the query itself.
# Create the output directory first so a fresh checkout does not fail on open().
output_directory = "Queries"
if not os.path.isdir(output_directory):
    os.makedirs(output_directory)

for query in queries:
    filename = os.path.join(output_directory, query + ".txt")
    results = VSM.query(query)
    with open(filename, "w") as f:
        # one "<page> --> <score>" line per ranked document
        for page, value in results:
            line = " --> " . join ([page, str(value)])
            f.write(line + "\n" )

In [38]:
# Interactive prompt: rank the pages for each query typed in, until a blank line is entered.
while True:
    print("Please enter a query (a blank query will exit the program).")
    # Named user_query so the builtin `input` is not shadowed for later cells;
    # strip() makes a whitespace-only line count as blank instead of being sent to the model
    user_query = raw_input().strip()

    if not user_query:
        break

    for page, value in VSM.query(user_query):
        # single formatted string: prints "page value" identically on Python 2 and 3
        print("%s %s" % (page, value))

    print("")

Please enter a query (a blank query will exit the program).
Alice
literature.offline/authors/carroll-lewis/through-the-looking-glass/chapter-02.html 0.403044578408
literature.offline/authors/carroll-lewis/through-the-looking-glass/chapter-05.html 0.368241212534
literature.offline/authors/carroll-lewis/through-the-looking-glass/chapter-09.html 0.366914084489
literature.offline/authors/carroll-lewis/alices-adventures-in-wonderland/chapter-08.html 0.358736909461
literature.offline/authors/carroll-lewis/alices-adventures-in-wonderland/chapter-06.html 0.351277210862
literature.offline/authors/carroll-lewis/through-the-looking-glass/chapter-03.html 0.349364379233
literature.offline/authors/carroll-lewis/alices-adventures-in-wonderland/chapter-01.html 0.349077170615
literature.offline/authors/carroll-lewis/alices-adventures-in-wonderland/chapter-09.html 0.315926586011
literature.offline/authors/carroll-lewis/alices-adventures-in-wonderland/chapter-05.html 0.292252467386
literature.offline/aut