In [1]:
import random
import time
from pathlib import Path
from urllib.parse import urljoin

import requests
import tqdm
from bs4 import BeautifulSoup

In [2]:
from async_requests import fetch_all_urls

In [3]:
# Site root, without a trailing slash: paths are appended to it directly.
base_url = "https://kubernetes.io"

## Home Page
We first issue a `GET` request to the docs home page, which gives us a starting point for navigating to the remaining pages.

In [4]:
# Fetch the docs landing page; its sidebar is the index of pages to scrape.
# The timeout stops the cell from hanging indefinitely on a stalled connection.
res = requests.get(f"{base_url}/docs/home/", timeout=30)
if res.status_code != 200:
    # Fail fast: carrying on would parse an error page and only blow up
    # later with a much less helpful message.
    raise RuntimeError(f"Error getting page content (HTTP {res.status_code})")

In [5]:
# Parse the raw response bytes into a navigable tree using the "html.parser" backend.
soup = BeautifulSoup(markup=res.content, features="html.parser")
# Debug aid: evaluate `soup.text` to inspect the extracted plain text.

## Extract Page URLs
All the pages are listed in a sidebar. We locate the sidebar's unordered list `<ul>`, then find all the hyperlink elements `<a>` that match a particular class. We then issue a `GET` request for each link's URL.

In [6]:
# Locate the sidebar navigation list that links to every documentation page.
side_bar = soup.find("ul", class_="ul-1")
if side_bar is None:
    # find() returns None on no match; surface that here with a clear message
    # instead of an AttributeError when the sidebar is used later.
    raise RuntimeError('Sidebar <ul class="ul-1"> not found; the page layout may have changed')

In [7]:
# Use the snake_case BeautifulSoup 4 API (find / find_all_next); the camelCase
# findChild / findAllNext spellings are deprecated aliases of these methods.
first_node = side_bar.find("a")
if first_node is None:
    raise RuntimeError("Sidebar contains no <a> elements; cannot collect page links")

# Collect every page-level sidebar link that appears after the first anchor.
# The class filter is passed as the full class-attribute string.
all_link_nodes = first_node.find_all_next(
    "a",
    class_="align-left pl-0 td-sidebar-link td-sidebar-link__page",
)

In [24]:
full_urls = [base_url + link.attrs["href"] for link in all_link_nodes]

# ASYNCHRONOUS APPROACH: Considerably faster
results = await fetch_all_urls(full_urls, max_concurrent=50)

In [27]:
# Build a list, not a generator: a generator is exhausted after one pass, so
# re-running the cell that writes the output file would silently produce an
# empty file.
# NOTE(review): result.response is presumably the fetched HTML body — confirm
# against async_requests.fetch_all_urls.
pages_content = [
    BeautifulSoup(result.response, "html.parser").text
    for result in results
]

In [18]:
# # Synchronous approach: glacial
# pages_content = []
# failure_logs = []

# for link in tqdm.tqdm(all_link_nodes):
#     full_url = base_url + link.attrs["href"]
#     res = requests.get(full_url)

#     if res.status_code != 200:
#         status_text = f"Error getting content for page: {full_url}" 
#         print(status_text)
#         failure_logs.append(status_text)

#     soup = BeautifulSoup(res.content, "html.parser")
#     pages_content.append(soup.text)

#     time.sleep(random.uniform(0.5, 1))

In [28]:
# Write every page's text to a single corpus file, separated by blank lines.
output_path = Path("data/kubernetes_docs.txt")
# Create the output directory on first run so a fresh checkout does not fail
# with FileNotFoundError.
output_path.parent.mkdir(parents=True, exist_ok=True)

with output_path.open("w", encoding="utf-8") as fp:
    for page in pages_content:
        fp.write(page)
        fp.write("\n\n\n")

# with open("data/error_log.txt", "w") as fp:
#     fp.writelines(failure_logs)