In [2]:
import Queue

# Maps each crawled page to the list of links that were found on that page
visited_pages = dict()
# Crawled pages, kept in the order in which they were visited
visited_pages_list = list()

In [3]:
from bs4 import BeautifulSoup

def get_source(old_absolute_link):
    # assert that we are visiting a new page
    try:
        assert(old_absolute_link not in visited_pages_list)
    except:
        """
            If we have visited the page already then
            we should act according to our reindexing strategy
            Since the pages are static we do not need one
        """
        print "PAGE ALREADY VISITED", old_absolute_link
    
    
    visited_pages[old_absolute_link] = [] # We will add links found on the page to this list
    visited_pages_list.append(old_absolute_link) # This list retains the order in which
    
    with open(old_absolute_link) as f:
        source = f.read()
        return source
    
# Starting page: path of a local HTML file, relative to the working directory
seed = "literature.offline/authors/carroll-lewis/index.html"

In [4]:
from collections import namedtuple # much more efficient than objects
from __future__ import division # means division can return decimals
from collections import defaultdict # a dict where you can set a default value
from collections import Counter # creates automatically key:frequency dictionary
import math

def remove_all_punctuation_and_numbers(word):
    '''
        Strip every non-alphabetic character from the word.
        Returns the remaining letters, in their original order, as one string.
    '''
    letters = []
    for character in word:
        if character.isalpha():
            letters.append(character)
    return "".join(letters)

In [5]:
read = file.read

# Read all of 'stop-word-list.csv'.
# The values are comma separated, so split on ','.
# Whitespace is stripped from both sides of each word: lstrip would probably
# have been enough, but the gain in efficiency is not worth the risk that
# there is a space between a word and the comma that follows it.
with open("stop-word-list.csv") as stopwords_file:
    raw_stopwords = read(stopwords_file)
    stopwords = [entry.strip() for entry in raw_stopwords.split(",")]
        

In [6]:
import pickle 

class VectorSpaceModel(object):
    """
        tf-idf vector space model over a collection of documents.

        Typical use:
            create_document(title, contents) once per document
            tf_idf() to build the idf cache and the document vector lengths
            query(string) to rank the documents by cosine similarity

        query() rebuilds the caches itself when documents have been added
        since the last call to tf_idf().
    """

    Document = namedtuple('Document',['title',  'term_frequencies'])

    def __init__(self, stopwords = None, **kwargs):
        """
            stopwords: collection of words to ignore (default: no stopwords)

            Keyword arguments:
                punctuation: callable taking a word and returning the cleaned word,
                             defaults to remove_all_punctuation_and_numbers
                stemming_dict: mapping of word -> stem, defaults to the pickled
                               snowball dictionary stored in "snowball.p"
        """
        # The module level helper is only looked up when no override is given
        if 'punctuation' in kwargs:
            self.remove_punctuation = kwargs['punctuation']
        else:
            self.remove_punctuation = remove_all_punctuation_and_numbers

        # A fresh list per instance, a mutable default would be shared by all instances
        self.stopwords = [] if stopwords is None else stopwords

        # dictionary to store how many documents a word was in
        self.inverse_document_counts = defaultdict(int)

        # downloaded the snowball stemming dictionary
        # created a python dictionary and pickled it
        # Only load it when the caller did not supply a dictionary, and close the file.
        if 'stemming_dict' in kwargs:
            self.stemming_dict = kwargs['stemming_dict']
        else:
            # NOTE: pickle.load can execute arbitrary code, only use a trusted snowball.p
            with open("snowball.p", "rb") as stemming_file:
                self.stemming_dict = pickle.load(stemming_file)

        self.documents = [] # Holds named tuples
        self.number_of_documents = 0
        self.idf_squared = {} # key = document frequency : value = idf**2 to speed up calculations
        self.document_lengths = {} # key = document title : value = length of its tf-idf vector


    def stem(self, word):
        """
            Remove whitespace from the word, change it to lower case and
            remove its punctuation so it can be compared.
            If the result is a stopword return None
            otherwise look it up in the stemming dictionary and return the
            stem, or the cleaned word itself when no stem is known
        """
        word = self.remove_punctuation(word.strip().lower())
        if word in self.stopwords:
            return None # this is a stopword so we ignore it
        return self.stemming_dict.get(word, word) # find a stem else return the word as is


    def _term_frequencies(self, text):
        """
            Stem every whitespace separated word of text and return a dictionary
            term : (occurrences of term / occurrences of the most frequent term)
            Returns an empty dictionary when no indexable term is left.
        """
        stems = (self.stem(word) for word in text.split())
        # stem returns None for stopwords (and '' for words without letters) so we must remove them
        counter = Counter(term for term in stems if term)

        if not counter:
            return {}

        most_frequent = counter.most_common(1)[0][1] # returns list of tuples hence long access line
        # float() keeps this a true division even without the __future__ import
        return {term: count / float(most_frequent) for term, count in counter.items()}


    def create_document(self, title, contents):
        """
            Find all the words in this document
            term frequencies are how many times a word appears divided by the count
            of the most frequent word in the document. These are used along with
            title to create a named tuple for this document
            inverse_document_count refers to how many documents a term appears in. A document
            does not have knowledge of the collection it is in so can only increment the count.

            A document without any indexable word is stored with an empty
            term frequency dictionary.
        """
        title = title.strip()
        document_term_frequencies = self._term_frequencies(contents)

        for kw in document_term_frequencies:
            self.inverse_document_counts[kw] += 1 # inverse document count

        doc_tuple = self.Document(title = title,  term_frequencies = document_term_frequencies)

        self.documents.append( doc_tuple )


    def tf_idf(self):
        """
            Refresh number_of_documents, the idf**2 cache and the vector
            length of every document from the documents added so far.
        """
        self.number_of_documents = len(self.documents)
        set_of_document_counts = set(self.inverse_document_counts.values())

        # cache the values of idf^2 for a given document frequency
        # assumes document idf = query idf
        self.idf_squared = {count : (self.inverse_document_frequency_method(count)**2) for count in set_of_document_counts}
        self.idf_squared[0] = 0 # terms that appear in no document carry no weight
        # Get the lengths of vectors
        self.document_lengths = {document.title: self.get_vector_length(document) for document in self.documents}


    def get_vector_length(self,document):

        """
            Vector length is math.sqrt(sum(tf_idf**2))
            tf_idf**2 = tf*idf*tf*idf = tf**2 * idf**2

            idf_squared already calculated (by tf_idf)
            tf is frequency divide by most frequent word

            document: a Document named tuple
            Returns the length as a float, 0.0 for a document without terms
        """
        idf_squared = self.idf_squared
        document_counts = self.inverse_document_counts

        # .get so that unknown terms are not inserted into the counts as a side effect
        length_squared = sum(idf_squared[document_counts.get(kw, 0)] * (tf**2)
                             for kw, tf in document.term_frequencies.items())

        return math.sqrt(length_squared)

    def query(self, string, top_n = 10):
        """
            Rank the documents by cosine similarity with the query string.

            string: the query, processed exactly like document contents
            top_n: maximum number of results returned (default 10)

            Returns a list of (title, score) tuples, best score first.
            Returns an empty list if the query holds no indexable term.
            A score is 0.0 when the query or the document has a zero length vector.
        """
        query_tf_dict = self._term_frequencies(string)
        if not query_tf_dict:
            return []

        # Caches never built, or documents were added since they were built
        if len(self.documents) != self.number_of_documents or 0 not in self.idf_squared:
            self.tf_idf()

        # get_vector_length expects a document so turn query into document
        query = self.Document(title="query",  term_frequencies=query_tf_dict)
        query_length = self.get_vector_length(query)

        # The idf**2 of a query term is the same for every document so look it up once
        query_terms = [(key, query_tf, self.idf_squared[self.inverse_document_counts.get(key, 0)])
                       for key, query_tf in query_tf_dict.items()]

        """
            Dot product:

            sum of
            for key value in tf_dict:

                (query tf * term idf) * (document tf * term idf)
                = query tf * document tf * term idf squared
        """

        document_scores = []

        for document in self.documents:

            dot_product = sum(document.term_frequencies.get(key, 0) * query_tf * term_idf_squared
                              for key, query_tf, term_idf_squared in query_terms)

            norm = self.document_lengths[document.title] * query_length

            # a zero length vector has no direction, so it is similar to nothing
            cos = dot_product/norm if norm else 0.0

            document_scores.append((document.title, cos))

        return sorted(document_scores, key = lambda x: x[1], reverse = True)[:top_n]


    def inverse_document_frequency_method(self, doc_frequency):
        """
            idf = log10(number of documents / number of documents containing the term)
            Returns 0 for a document frequency of 0 to avoid division by zero
        """
        if doc_frequency:
            return math.log10(float(self.number_of_documents)/doc_frequency)
        return 0 # to avoid division by zero
            
            
        
# Model instance used by the cells below; only the stopwords are passed, so the
# default punctuation remover and the pickled stemming dictionary are used
VSM = VectorSpaceModel(stopwords)

In [7]:


def get_links(old_absolute_link, soup):
    """
        Record the local htm/html links found on a page and queue the new ones.

        old_absolute_link: path of the page the soup was built from
        soup: parsed page, only soup.findAll('a') is used

        Side effects:
            every distinct valid link is appended to visited_pages[old_absolute_link]
            every link that is neither visited nor already queued is put on page_queue

        Fragments (the part from '#' on) are removed first, so page.html#top
        and page.html count as the same page.
    """

    valid_pages_extensions = ['htm', 'html']

    def not_visited(href): return href not in visited_pages_list and href not in page_queue.queue

    def not_online(href):  return not (href.startswith('http:') or href.startswith('https:') or href.startswith('www.'))

    def get_link_extension(href): return href.split('.')[-1]

    def valid_crawl(href): return not_online(href) and get_link_extension(href).lower() in valid_pages_extensions

    def get_new_absolute_link(absolute, relative):
        """
            absolute is a link in the format a/b/c/d.html
            relative is a link with directions from the current directory
            if relative is e.html then we simply replace d.html with e.html
            giving a/b/c/e.html

            if relative starts with one or more ../ then we need to remove earlier directories
            a/b/c/d.html + ../../e.html
            we have to go back two directories (remove b and c)
            giving a/e.html

            Any fragment is removed from the result.
        """
        back_directories_counter = 0

        while (relative.startswith("../")):
            relative = relative[3:]
            back_directories_counter += 1

        components = absolute.split("/")[:-(1+back_directories_counter)]
        components.append(relative)
        # return the link WITHOUT the fragment, otherwise the same page is
        # seen under several names and fails the extension check
        return "/".join(components).split('#')[0]


    this_pages_links_set = set()
    # setdefault so a page that was not registered by get_source still works
    this_pages_links = visited_pages.setdefault(old_absolute_link, [])


    for link in soup.findAll('a'):

        href = link.get('href') # None for anchors without an href
        if not href:
            continue

        href = href.split('#')[0]
        if not href:
            continue # the link only pointed to a fragment of the current page

        new_absolute_link = get_new_absolute_link(old_absolute_link, href)

        if valid_crawl(href) and valid_crawl(new_absolute_link):
            if new_absolute_link not in this_pages_links_set:
                this_pages_links_set.add(new_absolute_link)
                this_pages_links.append(new_absolute_link)
            # if not visited
            if not_visited(new_absolute_link):
                page_queue.put(new_absolute_link)
                

                
            

In [8]:
# pages to visit
page_queue = Queue.Queue()

page_queue.put(seed)



while (page_queue.qsize() > 0):
    next_page_link = page_queue.get()
    source = get_source(next_page_link)
    soup = BeautifulSoup(source)
    
    #http://stackoverflow.com/questions/5598524/can-i-remove-script-tags-with-beautifulsoup
    [script.extract() for script in soup('script')]
    for comment in soup('Comment'):
        print "Comment Found", comment
    
    #get_links(next_page_link, soup)
    #VSM.create_document(next_page_link, soup.get_text())

In [9]:


# Build the idf**2 cache and the vector length of every document in VSM.documents
VSM.tf_idf()

In [10]:
import os

queries = [
    "Curiouser said Alice",
    "late white rabbit",
    "Snark hunting is fun",
    "Vorpal blade",
    "Snicker Carpenter Snack Oyster",
    "Professor Gardener Waggly Revenge",
]

# The result files are written to this directory, create it on the first run
if not os.path.isdir("Queries"):
    os.makedirs("Queries")

for query in queries:
    filename = "Queries/" + query + ".txt"
    results = VSM.query(query)
    # "w" replaces the results of an earlier run; appending would leave
    # several, possibly outdated, rankings in the same file
    with open(filename, "w") as f:
        for page, value in results:
            line = " --> ".join([page, str(value)])
            f.write(line + "\n")
        



In [None]:
import os

filename = "Crawl/Crawl.txt"

# The report is written to this directory, create it on the first run
if not os.path.isdir("Crawl"):
    os.makedirs("Crawl")

# One line per visited page, in visiting order, followed by the links found on it.
# "w" replaces the report of an earlier run instead of appending a second copy.
with open(filename, "w") as f:
    for page in visited_pages_list:
        f.write(page + "\n")
        for link in visited_pages[page]:
            f.write("---> " + link + "\n")

In [12]:
for doc in VSM.documents:
    print doc.title
    for k,v in doc.term_frequencies.iteritems():
        print k,v

literature.offline/authors/carroll-lewis/index.html
knowledg 0.5
carroll 0.5
lewi 0.5
through 0.5
wonderland 0.5
librari 1.0
bruno 0.5
online 1.0
import 0.5
sylvie 0.5
updated 0.5
cssliteraturecss 0.5
alices 0.5
glass 0.5
ltdlast 0.5
snark 0.5
adventur 0.5
look 0.5
literatureoffline 1.0
hunt 0.5
literatur 1.0
matter 0.5
contact 0.5
sponsored 0.5
literature.offline/authors/carroll-lewis/alices-adventures-in-wonderland/index.html
origin 0.0833333333333
hunt 0.0833333333333
rabbit 0.0833333333333
actaully 0.0833333333333
queen 0.0833333333333
carroll 0.0833333333333
soon 0.0833333333333
pig 0.0833333333333
down 0.0833333333333
sourc 0.0833333333333
through 0.0833333333333
long 0.0833333333333
file 0.0833333333333
stole 0.0833333333333
wonderlandlewis 0.0833333333333
librari 0.166666666667
tale 0.0833333333333
adventur 0.0833333333333
rabbithole 0.0833333333333
caucusrace 0.0833333333333
advic 0.0833333333333
author 0.0833333333333
lobster 0.0833333333333
detail 0.0833333333333
send 0.0833

In [22]:
from collections import OrderedDict 

# NOTE(review): this global is not read by page_rank, whose parameter of the
# same name shadows it; the first call below passes no ranks at all
page_rank_old = {}



def page_rank(page_rank_old = None, damping_factor = 0.8, i = 0):
    """
        Run one iteration of page rank over the crawled pages.

        page_rank_old: dictionary page : rank of the previous iteration.
                       When empty or None only the seed page has a rank (of 1).
        damping_factor: share of a page's rank that is passed on to the pages it links to
        i: iteration number, not used in the calculation

        Every page in visited_pages_list gets
            (1 - damping_factor) + sum of damping_factor * rank / number_of_links
        over the pages that link to it. Links to pages that were never crawled
        are skipped, their share is not redistributed.

        Returns a new dictionary page : rank.
    """
    if not page_rank_old:
        page_rank_old = {seed: 1}

    # Rank received through incoming links, per crawled page
    received = {}
    for page in visited_pages_list:
        received[page] = 0

    for page in visited_pages_list:
        links = visited_pages[page]
        number_of_links = len(links)
        influence = page_rank_old.get(page, 0)
        for link in links:
            # a link target that was never crawled has no rank to update
            if link in received:
                received[link] += damping_factor*influence/number_of_links

    page_rank_new = {}
    for page in visited_pages_list:
        page_rank_new[page] = received[page] + (1-damping_factor)

    return page_rank_new

dic = page_rank()

for i in xrange (15):
    dic = page_rank(dic, i=i)
    print "PAGE RANK"
    for k,v in OrderedDict(sorted(dic.iteritems(), key=lambda item: -item[1])).items()[:10]:
        print k,v

PAGE RANK
literature.offline/authors/carroll-lewis/index.html 2.79366666667
literature.offline/authors/carroll-lewis/sylvie-and-bruno/index-2.html 1.295
literature.offline/authors/carroll-lewis/through-the-looking-glass/index-2.html 0.733333333333
literature.offline/authors/carroll-lewis/alices-adventures-in-wonderland/index-2.html 0.733333333333
literature.offline/authors/carroll-lewis/the-hunting-of-the-snark/index-2.html 0.632
literature.offline/authors/carroll-lewis/sylvie-and-bruno/index.html 0.340333333333
literature.offline/authors/carroll-lewis/the-hunting-of-the-snark/index.html 0.340333333333
literature.offline/authors/carroll-lewis/alices-adventures-in-wonderland/index.html 0.340333333333
literature.offline/authors/carroll-lewis/through-the-looking-glass/index.html 0.340333333333
literature.offline/authors/carroll-lewis/the-hunting-of-the-snark/chapter-02.html 0.325333333333
PAGE RANK
literature.offline/authors/carroll-lewis/index.html 4.0807337037
literature.offline/authors