# Problem:
#
# Given a URL, crawl that webpage for URLs, and then continue crawling until
# you've visited all URLs. Return all links accessed.
#
# Assume you have an API with two methods that you can use:
#   get_html_content(url: str) -> str:
#       """returns html of the webpage of url"""
#   get_links_on_page(html: str) -> list[str]:
#       """returns array of the urls in the html"""
#
# Both implementations below are depth-first traversals that fetch every
# distinct URL exactly once.
#
# Time complexity: O(n * (s + c)), where
#   n = number of distinct URLs reached,
#   s = number of links found on a single page,
#   c = length of a page's content (assuming fetching/parsing is linear in c).


# Iterative Implementation
class WebCrawler:
    """Depth-first web crawler driven by an explicit stack.

    Attributes:
        visited_urls: every URL fetched so far (kept across calls).
        stack: URLs still waiting to be fetched; empty between crawls.
    """

    def __init__(self):
        self.visited_urls: set[str] = set()
        self.stack: list[str] = []
        # Sets have no stable iteration order, so the order in which URLs
        # were first fetched is tracked separately for reproducible results.
        self._visit_order: list[str] = []

    def web_crawl(self, url: str) -> list[str]:
        """Crawl everything reachable from *url*.

        Args:
            url: the page to start crawling from.

        Returns:
            Every URL visited by this crawler so far (including earlier
            calls on the same instance), in the order it was first fetched.
        """
        self.stack.append(url)

        while self.stack:
            curr_url = self.stack.pop()

            # The same link can be pushed more than once before it is
            # fetched, so the check has to happen at pop time.
            if curr_url in self.visited_urls:
                continue

            self.visited_urls.add(curr_url)
            self._visit_order.append(curr_url)

            html = self.get_html_content(curr_url)
            links = self.get_links_on_page(html)

            # Push in reverse so the first link on the page is popped (and
            # therefore crawled) first, matching the recursive crawler.
            # list() keeps this working for any iterable of links.
            self.stack.extend(
                link
                for link in reversed(list(links))
                if link not in self.visited_urls
            )

        return list(self._visit_order)

    def get_html_content(self, url: str) -> str:
        """Return the HTML of the page at *url* (placeholder)."""
        # This is a placeholder for the actual implementation
        return "html content"

    def get_links_on_page(self, html: str) -> list[str]:
        """Return the URLs found in *html* (placeholder)."""
        # This is a placeholder for the actual implementation
        return ["link1", "link2", "link3"]


# Recursive Implementation
class WebCrawlerRecursive:
    """Depth-first web crawler that uses the call stack.

    Recursion depth grows with the longest chain of not-yet-visited links,
    so a very deep site can raise RecursionError; WebCrawler (iterative)
    does not have that limit.

    Attributes:
        visited_urls: every URL fetched so far (kept across calls).
    """

    def __init__(self):
        self.visited_urls: set[str] = set()
        # First-fetch order, tracked separately because sets are unordered.
        self._visit_order: list[str] = []

    def web_crawl(self, url: str) -> list[str]:
        """Crawl everything reachable from *url*.

        Args:
            url: the page to start crawling from.

        Returns:
            Every URL visited by this crawler so far (including earlier
            calls on the same instance), in the order it was first fetched.
        """
        self._visit(url)
        # Copy once here rather than on every recursive return.
        return list(self._visit_order)

    def _visit(self, url: str) -> None:
        """Fetch *url* if it is new, then recurse into each link on it."""
        if url in self.visited_urls:
            return

        self.visited_urls.add(url)
        self._visit_order.append(url)

        html = self.get_html_content(url)
        for link in self.get_links_on_page(html):
            if link not in self.visited_urls:
                self._visit(link)

    def get_html_content(self, url: str) -> str:
        """Return the HTML of the page at *url* (placeholder)."""
        # This is a placeholder for the actual implementation
        return "html content"

    def get_links_on_page(self, html: str) -> list[str]:
        """Return the URLs found in *html* (placeholder)."""
        # This is a placeholder for the actual implementation
        return ["link1", "link2", "link3"]