Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 76 additions & 0 deletions web_crawler.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
# Problem:

# Given a URL, crawl that webpage for URLs, and then continue crawling until you've visited all URLs.
# Return all links accessed.

# Assume you have an API with two methods that you can use:
# get_html_content(url: str) -> str:
# """returns html of the webpage of url"""
# get_links_on_page(html: str) -> list[str]:
# """returns array of the urls in the html"""


# DFS Implementation
# Time complexity: O(V * (E + C))
# V = number of distinct URLs visited
# E = number of links found on each page (the inner for loop)
# C = length of a page's HTML content

# Iterative Implementation
class WebCrawler:
    """Crawl every URL reachable from a starting URL using an iterative DFS.

    Runs in O(V * (E + C)) time, where V is the number of distinct URLs,
    E the number of links per page, and C the length of a page's content.
    """

    def __init__(self):
        # URLs that have already been fetched and parsed.
        self.visited_urls = set()
        # Retained for backward compatibility with callers that may read it;
        # the traversal now uses a function-local stack so that a failed or
        # interleaved crawl cannot leave stale state here.
        self.stack = []

    def web_crawl(self, url: str) -> list[str]:
        """Visit every URL reachable from ``url`` and return all URLs seen.

        Uses a local stack (instead of shared instance state) so repeated
        calls — or a call that raises mid-crawl — cannot interfere with
        each other.  ``visited_urls`` accumulates across calls, matching
        the original behavior.
        """
        stack = [url]

        while stack:
            curr_url = stack.pop()

            # A URL may be pushed several times before it is first popped;
            # skip it on any later pop.
            if curr_url in self.visited_urls:
                continue

            self.visited_urls.add(curr_url)

            html = self.get_html_content(curr_url)
            for link in self.get_links_on_page(html):
                if link not in self.visited_urls:
                    stack.append(link)

        return list(self.visited_urls)

    def get_html_content(self, url: str) -> str:
        # This is a placeholder for the actual implementation
        return "html content"

    def get_links_on_page(self, html: str) -> list[str]:
        # This is a placeholder for the actual implementation
        return ["link1", "link2", "link3"]

# Recursive Implementation
class WebCrawlerRecursive:
    """Depth-first web crawler that explores links via recursion.

    Each call to :meth:`web_crawl` marks the URL as visited, then recurses
    into every not-yet-visited link found on that page.  The set of visited
    URLs is shared across the recursive calls through the instance.
    """

    def __init__(self):
        # All URLs crawled so far; grows monotonically across calls.
        self.visited_urls = set()

    def web_crawl(self, url: str) -> list[str]:
        """Crawl ``url`` and everything reachable from it; return all URLs seen."""
        if url not in self.visited_urls:
            self.visited_urls.add(url)
            page = self.get_html_content(url)
            for neighbor in self.get_links_on_page(page):
                if neighbor not in self.visited_urls:
                    self.web_crawl(neighbor)
        return list(self.visited_urls)

    def get_html_content(self, url: str) -> str:
        # Stub — a real crawler would fetch the page over the network.
        return "html content"

    def get_links_on_page(self, html: str) -> list[str]:
        # Stub — a real crawler would parse anchors out of the HTML.
        return ["link1", "link2", "link3"]