In [1]:
import requests
from bs4 import BeautifulSoup
import re
from urllib.parse import urlparse, parse_qs, urlunparse

In [2]:
def clean_url(url):
    """Return ``url`` with the ``prev`` and ``pto`` query parameters removed.

    The query string is parsed with ``parse_qs`` and rebuilt by hand, which
    has a few consequences worth knowing:

    * values come back percent-decoded and are NOT re-encoded;
    * only the first value of a repeated parameter is kept;
    * parameters with blank values are dropped (``parse_qs`` default).

    Every other URL component is passed through unchanged.
    """
    parts = urlparse(url)
    kept_pairs = []
    for name, values in parse_qs(parts.query).items():
        if name in ('prev', 'pto'):
            continue
        # Only the first value survives for parameters given more than once.
        kept_pairs.append(f"{name}={values[0]}")
    return urlunparse(parts._replace(query="&".join(kept_pairs)))

In [3]:
def save_board_tokens(board_tokens, path):
    """Write each token in ``board_tokens`` to ``path``, one per line.

    Tokens are written in iteration order, so passing a ``set`` gives an
    arbitrary line order. An existing file at ``path`` is overwritten.
    """
    with open(path, 'w') as out:
        out.writelines(f"{token}\n" for token in board_tokens)

In [4]:
def get_board_tokens(query, num_pages=1):
    """Collect Greenhouse board tokens from Google search result pages.

    Requests up to ``num_pages`` result pages for ``query`` and scans every
    ``<a href>`` for a ``boards.greenhouse.io/<token>`` URL.

    Args:
        query: Search string sent as the ``q`` parameter.
        num_pages: Maximum number of result pages to request; the ``start``
            offset advances by 10 per page.

    Returns:
        A ``set`` of the distinct token strings found. If a request fails or
        returns a non-OK status, paging stops and the tokens gathered so far
        are returned.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    }
    board_tokens = set()
    base_url = "https://www.google.com/search"
    # End the token at the first path, query, or fragment delimiter. Without
    # '&' in the class, a redirect-style href such as
    # "/url?q=https://boards.greenhouse.io/acme&sa=U" would capture "acme&sa=U".
    token_pattern = re.compile(r"boards\.greenhouse\.io/([^/?&#\s]+)")

    for page in range(num_pages):
        params = {
            "q": query,
            "start": page * 10
        }
        try:
            # A timeout keeps a stalled connection from hanging the cell forever.
            response = requests.get(base_url, headers=headers, params=params, timeout=10)
        except requests.RequestException as exc:
            print(f"Request for page {page+1} failed ({exc}); stopping early")
            break
        if not response.ok:
            # An error page (e.g. rate limiting) holds no results; requesting
            # further pages would only repeat the failure.
            print(f"Page {page+1} returned HTTP {response.status_code}; stopping early")
            break
        soup = BeautifulSoup(response.text, "html.parser")

        for a_tag in soup.find_all("a", href=True):
            match = token_pattern.search(clean_url(a_tag["href"]))
            if match:
                board_tokens.add(match.group(1))

        # Progress report every tenth page.
        if (page+1) % 10 == 0:
            print(f"The page {page+1} and {len(board_tokens)} broad_token")

    return board_tokens

In [5]:
# Scan up to 100 result pages for boards hosted on boards.greenhouse.io.
tokens = get_board_tokens(query="site:boards.greenhouse.io", num_pages=100)

The page 10 and 95 broad_token
The page 20 and 177 broad_token
The page 30 and 244 broad_token
The page 40 and 300 broad_token
The page 50 and 300 broad_token
The page 60 and 300 broad_token
The page 70 and 300 broad_token
The page 80 and 300 broad_token
The page 90 and 300 broad_token
The page 100 and 300 broad_token


In [8]:
# Persist the collected tokens, one per line.
save_board_tokens(tokens, path="board_tokens.txt")

# Combine `board_tokens` files

In [1]:
def combine_and_deduplicate_files(file_paths, output_path):
    """Merge token files into one sorted, de-duplicated file.

    Each input file is read line by line; surrounding whitespace is stripped
    and blank lines are ignored, so they never become an empty token in the
    output.

    Args:
        file_paths: Iterable of paths to text files with one token per line.
        output_path: Destination path; overwritten if it exists. All inputs
            are fully read before this file is opened for writing.
    """
    combined_tokens = set()

    for file_path in file_paths:
        with open(file_path, 'r') as file:
            for line in file:
                token = line.strip()
                # Skip blank/whitespace-only lines instead of recording "".
                if token:
                    combined_tokens.add(token)

    with open(output_path, 'w') as output_file:
        output_file.writelines(f"{token}\n" for token in sorted(combined_tokens))

In [2]:
# Merge the per-run token files into a single list.
# Note: the output path is the same "board_tokens.txt" written by the save step.
combine_and_deduplicate_files(
    file_paths=["board_tokens_0.txt", "board_tokens_1.txt", "board_tokens_2.txt"],
    output_path="board_tokens.txt",
)