In [1]:
import Queue
from bs4 import BeautifulSoup
import pickle

# Crawl state shared by the functions and cells below
visited_pages = {} # Dictionary of pages with value a list of links found on that page
visited_pages_list = [] # Keep track of the order in which pages were visited

In [2]:
def get_source(old_absolute_link):
    """
        Read a page from disk and register it as visited.

        old_absolute_link is the path of the page; it is handed
        straight to open().

        Returns the content of the file, or None (after printing a
        notice) when the page has already been visited.
        Errors raised by open()/read() propagate to the caller; in that
        case the page is NOT recorded as visited.
    """
    # If we have visited the page already then we should act according
    # to our reindexing strategy. Since the pages are static we do not
    # need one: the page is simply skipped.
    # An explicit test is used instead of assert + bare except, because
    # assertions are removed when Python runs with -O and a bare except
    # would also hide unrelated errors.
    # visited_pages gets a key for every page appended to
    # visited_pages_list below, so the dictionary lookup is equivalent
    # to scanning the list.
    if old_absolute_link in visited_pages:
        # Single-argument form prints the same text under Python 2 and 3
        print("PAGE ALREADY VISITED %s" % (old_absolute_link,))
        return None

    # Read first and record afterwards, so that a page which cannot be
    # opened does not end up in the visited structures.
    with open(old_absolute_link) as f:
        source = f.read()

    visited_pages[old_absolute_link] = [] # get_links adds the links found on the page to this list
    visited_pages_list.append(old_absolute_link) # This list retains the order in which pages were visited
    return source
    
# Page the crawl starts from; it is the first entry put on page_queue
seed = "literature.offline/authors/carroll-lewis/index.html"

In [3]:
def get_links(old_absolute_link, soup):
    """
        Collect the crawlable links of one page.

        old_absolute_link is the path of the page being processed; it
        must already be a key of visited_pages.
        soup is the parsed page; every element returned by
        soup.findAll('a') is examined.

        Each local link to an .htm/.html page is resolved against
        old_absolute_link, appended once (in document order) to
        visited_pages[old_absolute_link] and put on page_queue unless
        it was already visited or is already waiting in the queue.
        Fragments (#...) are removed first, so page.html and
        page.html#section count as the same page.
        Returns None.
    """

    # only pages with these extensions are valid crawls
    valid_pages_extensions = ['htm', 'html']

    def not_visited(href):
        # True when href was neither visited nor is waiting in the queue
        return href not in visited_pages_list and href not in page_queue.queue

    def not_online(href):
        # True when href does not look like a link to an online resource
        return not href.startswith(('http:', 'https:', 'www.'))

    def get_link_extension(href):
        # Text after the last dot of href
        return href.split('.')[-1]

    def valid_crawl(href):
        # Local link whose extension is one of valid_pages_extensions
        return not_online(href) and get_link_extension(href).lower() in valid_pages_extensions

    def strip_fragment(href):
        # e.html#top -> e.html ; #top -> empty string
        return href.split('#')[0]

    def get_new_absolute_link(absolute, relative):
        """
            absolute is a link in the format a/b/c/d.html
            relative is a link with directions from the current directory
            if relative is e.html (or ./e.html) then we simply replace
            d.html with e.html
            giving a/b/c/e.html

            if relative starts with one or more ../ then we need to remove earlier directories
            a/b/c/d.html + ../../e.html
            we have to go back two directories (remove b and c)
            giving a/e.html
        """
        back_directories_counter = 0

        while True:
            if relative.startswith("../"):
                relative = relative[3:]
                back_directories_counter += 1
            elif relative.startswith("./"):
                # "./" stays in the current directory: drop it so the
                # same page is not known under two spellings
                relative = relative[2:]
            else:
                break

        components = absolute.split("/")[:-(1 + back_directories_counter)]
        components.append(relative)
        return "/".join(components)

    # Links already recorded for this page. visited_pages keeps them in
    # a list to retain their order, so duplicates are filtered here.
    this_pages_links_set = set()

    for link in soup.findAll('a'):

        # <a> elements without an href (e.g. named anchors) are ignored
        href = link.get('href')
        if not href:
            continue

        # Drop the fragment before validating: otherwise the extension
        # of e.html#top would be read as "html#top"
        href = strip_fragment(href)
        if not valid_crawl(href):
            continue

        new_absolute_link = get_new_absolute_link(old_absolute_link, href)
        if not valid_crawl(new_absolute_link):
            continue

        if new_absolute_link in this_pages_links_set:
            continue

        this_pages_links_set.add(new_absolute_link)
        visited_pages[old_absolute_link].append(new_absolute_link)

        # only queue pages that still have to be crawled
        if not_visited(new_absolute_link):
            page_queue.put(new_absolute_link)

In [4]:
# Frontier of the crawl: pages that still have to be visited,
# starting with the seed page
page_queue = Queue.Queue()
page_queue.put(seed)

# Keep taking pages from the queue until it is empty.
# Each page is read, parsed with BeautifulSoup and handed to get_links,
# which may put further pages on the queue.
# NOTE(review): no parser is passed to BeautifulSoup, so the library
# chooses one itself - presumably this depends on what is installed; confirm.
while not page_queue.empty():
    next_page_link = page_queue.get()
    source = get_source(next_page_link)

    if not source:
        # Nothing to parse (e.g. get_source returns None for a page
        # that was already visited)
        continue

    soup = BeautifulSoup(source)
    get_links(next_page_link, soup)

In [5]:
filename = "Crawl/Crawl.txt"

# Prints each page (in the order it was visited) followed by the links
# found on that page.
# The file is opened with "w" rather than "a+": the file is never read
# here, and appending made every re-run of this cell add a second copy
# of the whole report to the file.
with open(filename, "w") as f:
    for page in visited_pages_list:
        f.write(page + "\n")
        for link in visited_pages[page]:
            f.write("---> " + link + "\n")

In [7]:
# Save visited_pages_list and visited_pages for use in the other parts.
# The files are opened in binary mode, so no newline translation is
# applied to the pickle data, and inside with-blocks, so they are
# flushed and closed as soon as the dump is done.
with open("vp.p", "wb") as vp_file:
    pickle.dump(visited_pages, vp_file)

with open("vpl.p", "wb") as vpl_file:
    pickle.dump(visited_pages_list, vpl_file)