#### --- *Web Crawler: Traversing a Web Graph* ---

In [2]:
import string
import urllib.request
import webbrowser
from collections import deque
from pathlib import Path

from bs4 import BeautifulSoup

In [267]:
# Helper function 1: collect the links found on a single page

def getLinks(href):
    """Return the target of every link found on one page of the site.

    Args:
        href: Page location relative to the site root defined below,
            e.g. 'index.html'.

    Returns:
        A list with the raw ``href`` attribute of each ``<a>`` tag on the
        page. Duplicates are kept and the values are neither resolved nor
        filtered.

    Raises:
        urllib.error.URLError: Propagated from ``urlopen`` when the page
            cannot be retrieved.
    """
    preUrl = 'https://people.ischool.berkeley.edu/~chuang/i206/b5/'

    # Retrieve the HTML document; the context manager closes the HTTP
    # response even if read() raises, so no connection is leaked.
    with urllib.request.urlopen(preUrl + href) as webUrl:
        webData = webUrl.read()

    # Parse data into BeautifulSoup object
    data = BeautifulSoup(webData, 'html.parser')

    # href=True skips anchors that carry no href attribute
    return [link['href'] for link in data.find_all('a', href=True)]

In [268]:
# Traverses a web graph consisting of a self-contained set of linked web pages

def webCrawler(s, get_links=None):
    """Breadth-first traversal of the pages reachable from a start page.

    Args:
        s: The page to start crawling from.
        get_links: Optional callable taking a page and returning the list of
            pages it links to. Defaults to ``getLinks``; supplying another
            callable lets the traversal run on an in-memory graph.

    Returns:
        A tuple ``(visited, pages_crawled_cnt, links_found_cnt)`` where
        ``visited`` lists each discovered page once, in discovery order,
        ``pages_crawled_cnt`` is ``len(visited)`` and ``links_found_cnt`` is
        the total number of links seen across all crawled pages, repeats
        included.
    """
    if get_links is None:
        # Resolved at call time so the default follows the notebook's helper
        get_links = getLinks

    # initialize queue and visited links to contain start url s
    queue = deque([s])
    visited = [s]
    # Set mirror of `visited`: constant-time membership checks, while the
    # list keeps the discovery order that callers receive.
    seen = {s}

    # initialize number of links found
    links_found_cnt = 0

    while queue:  # while queue is not empty
        cur = queue.popleft()  # remove vertex cur
        cur_neighbours = get_links(cur)  # get all the links on current page
        links_found_cnt += len(cur_neighbours)

        for neighbour in cur_neighbours:
            if neighbour not in seen:
                seen.add(neighbour)
                visited.append(neighbour)
                queue.append(neighbour)

    pages_crawled_cnt = len(visited)

    return visited, pages_crawled_cnt, links_found_cnt

In [269]:
# Entry page of the crawl; webCrawlIndexer also reads this global.
startPage = 'index.html'
# Last expression of the cell: displays the returned tuple
# (visited pages, number of pages crawled, number of links found).
webCrawler(startPage)

(['index.html',
  'information.html',
  'Berkeley.html',
  'ISchool.html',
  'MIMS.html',
  'CityOfBerkeley.html',
  'UCBerkeley.html',
  'BerkeleyCollege.html',
  'SouthHall.html',
  'Campanile.html'],
 10,
 22)

#### --- *Indexing Web Pages* ---

In [270]:
# Helper function 2: extract the normalised text of a single page

def getText(href):
    """Return the visible text of one page, lower-cased and without punctuation.

    Args:
        href: Page location relative to the site root defined below,
            e.g. 'index.html'.

    Returns:
        A single string: the page text as extracted by BeautifulSoup's
        ``get_text()``, with every character in ``string.punctuation``
        removed and the remainder converted to lower case.

    Raises:
        urllib.error.URLError: Propagated from ``urlopen`` when the page
            cannot be retrieved.
    """
    preUrl = 'https://people.ischool.berkeley.edu/~chuang/i206/b5/'

    # Retrieve the HTML document; the context manager closes the HTTP
    # response even if read() raises, so no connection is leaked.
    with urllib.request.urlopen(preUrl + href) as webUrl:
        webData = webUrl.read()

    # Parse data into BeautifulSoup object
    data = BeautifulSoup(webData, 'html.parser')

    # Deleting punctuation (rather than replacing it with a space) keeps
    # tokens such as "don't" as the single word "dont".
    strip_punctuation = str.maketrans('', '', string.punctuation)
    text = data.get_text().translate(strip_punctuation).lower()

    return text

In [271]:
# Builds an inverted index based on the words found on each web page

def webCrawlIndexer(start=None):
    """Build an inverted index (word -> pages containing it) for the site.

    Args:
        start: Page to begin crawling from. When omitted, the notebook-level
            ``startPage`` variable is used, which matches the behaviour of
            calling this function without arguments.

    Returns:
        A tuple ``(entries_cnt, inv_index)`` where ``inv_index`` maps each
        word to the set of pages whose text contains it and ``entries_cnt``
        is the number of distinct words.
    """
    if start is None:
        # Fall back to the global defined in the crawl cell
        start = startPage

    # Initialize inverted index
    inv_index = {}

    # Get and iterate through all the web pages
    pages = webCrawler(start)[0]

    for page in pages:
        # getText returns lower-cased, punctuation-free text, so splitting
        # on whitespace yields the index terms directly.
        for word in getText(page).split():
            inv_index.setdefault(word, set()).add(page)

    entries_cnt = len(inv_index)

    return entries_cnt, inv_index

In [272]:
# Build the index and display the returned (entry count, inverted index) tuple.
webCrawlIndexer()

(460,
 {'206': {'index.html'},
  'crawler': {'index.html'},
  'home': {'Berkeley.html',
   'BerkeleyCollege.html',
   'Campanile.html',
   'CityOfBerkeley.html',
   'ISchool.html',
   'MIMS.html',
   'SouthHall.html',
   'UCBerkeley.html',
   'index.html',
   'information.html'},
  'page': {'index.html', 'information.html'},
  'the': {'BerkeleyCollege.html',
   'Campanile.html',
   'CityOfBerkeley.html',
   'ISchool.html',
   'MIMS.html',
   'SouthHall.html',
   'UCBerkeley.html',
   'index.html',
   'information.html'},
  'where': {'index.html'},
  'any': {'UCBerkeley.html', 'index.html'},
  'information': {'ISchool.html',
   'MIMS.html',
   'index.html',
   'information.html'},
  'can': {'Berkeley.html', 'index.html'},
  'be': {'Berkeley.html', 'index.html'},
  'found': {'index.html'},
  'is': {'BerkeleyCollege.html',
   'Campanile.html',
   'CityOfBerkeley.html',
   'ISchool.html',
   'MIMS.html',
   'SouthHall.html',
   'UCBerkeley.html',
   'index.html'},
  'homepage': {'index.htm

In [273]:
# Keep only the inverted index (second element of the returned tuple) for the
# search cell; note that this call crawls and indexes the site again.
inv_index = webCrawlIndexer()[1]

#### --- *Search Query Interface* ---

In [274]:
# Prompt users to enter a search query term
# Same punctuation removal that is applied to page text before indexing, so a
# query such as "Berkeley!" matches the indexed word "berkeley".
PUNCT_TABLE = str.maketrans('', '', string.punctuation)

while True:
    # Surrounding whitespace would otherwise make an indexed word miss
    query = input("Please enter your search query term: ").strip().lower()

    # Quit the interface if the user enters 'q'
    if query == 'q':
        break

    query = query.translate(PUNCT_TABLE)

    # Print the set of web pages corresponding to the query term if it exists in the inverted index
    if query in inv_index:
        print(inv_index[query])
    # Print "No results found" if it does not exist
    else:
        print("No results found")

Please enter your search query term: 206
{'index.html'}
Please enter your search query term: berkeley
{'UCBerkeley.html', 'information.html', 'ISchool.html', 'Berkeley.html', 'CityOfBerkeley.html', 'BerkeleyCollege.html'}
Please enter your search query term: information
{'information.html', 'MIMS.html', 'ISchool.html', 'index.html'}
Please enter your search query term: random
No results found
Please enter your search query term: q


#### --- *Search Results Webpage* ---

In [3]:
# Open the search results webpage (a list of web pages that contain the search
# term) in the default browser. This cell only opens the file; it does not
# generate it.
# NOTE(review): assumes search_results.html sits in the kernel's working
# directory (normally the notebook's folder) - adjust RESULTS_PAGE otherwise.
RESULTS_PAGE = Path("search_results.html")

# webbrowser expects a URL, so turn the resolved absolute path into file://...
webbrowser.open(RESULTS_PAGE.resolve().as_uri())

True