In [28]:
# Load libraries (standard library first, then third-party)
import pickle
import random
import re
import time
from urllib.parse import urljoin
from urllib.parse import urlparse

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager


In [29]:
# Initialize Chrome Options
# `options` is extended by the next cell and then passed to webdriver.Chrome().
options = webdriver.ChromeOptions()
options.add_argument("--headless")  # Run in headless mode
# NOTE(review): the next two flags are presumably there for container/CI-style
# environments (sandbox and /dev/shm limits) -- confirm they are needed here.
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
options.add_experimental_option("prefs", {"profile.managed_default_content_settings.images": 2})  # Disable images


In [30]:
# Configure Chrome anti-bot measures
# Mutates the `options` object created in the previous cell, so that cell must
# have run first. The three settings below presumably hide the usual
# "controlled by automated software" signals from the site -- confirm against
# the Chrome/ChromeDriver version in use.
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option("useAutomationExtension", False)
options.add_argument("--disable-blink-features=AutomationControlled")


In [31]:
# Initialize WebDriver with Chrome options
# The chromedriver location comes from ChromeDriverManager().install()
# (presumably downloading a matching driver when none is cached -- confirm).
# The resulting module-level `driver` is the one main() uses and the final cell quits.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)


In [32]:
# Define paths for cookies and initial URLs
# Pickle file read/written by load_cookies()/save_cookies(); the path is relative,
# so it resolves against the kernel's current working directory.
cookie_file_path = "cookies.pkl"
# Site root: first page main() opens, and the base that relative /quote/ hrefs are joined onto.
base_url = "https://finance.yahoo.com/"
# Basic Materials sector page: source of the industry rows and prefix of every per-industry URL.
sectors_url = "https://finance.yahoo.com/sectors/basic-materials/"

In [33]:
# Persist the browser session's cookies to disk
def save_cookies(driver, path):
    """Pickle the cookies currently held by ``driver`` into the file at ``path``.

    Args:
        driver: Object exposing ``get_cookies()`` (a Selenium WebDriver here).
        path: Destination file; it is created or overwritten.

    Returns:
        None.
    """
    # The file is opened before the cookies are requested, so the target file
    # is created/truncated even if fetching the cookies fails.
    with open(path, "wb") as cookie_jar:
        pickle.dump(driver.get_cookies(), cookie_jar)

In [34]:
# Load cookies function with domain check
def load_cookies(driver, path):
    """Restore cookies saved by ``save_cookies`` into the current browser session.

    Only cookies whose ``domain`` applies to the host of ``driver.current_url``
    are added: the host must equal the cookie domain or be a sub-domain of it
    (a leading dot on the cookie domain is ignored). Cookies without a
    ``domain`` entry are skipped.

    Args:
        driver: Object exposing ``current_url`` and ``add_cookie(cookie)``
            (a Selenium WebDriver here). It must already be on the target
            site so that ``current_url`` reflects the right host.
        path: Pickle file previously written by ``save_cookies``.

    Returns:
        bool: True when the cookie file was read (even if no cookie matched the
        current host); False when the file is missing, empty or not a valid
        pickle.
    """
    try:
        with open(path, "rb") as file:
            # SECURITY: unpickling can execute arbitrary code. Only load cookie
            # files that this notebook wrote itself.
            cookies = pickle.load(file)
    except FileNotFoundError:
        return False
    except (EOFError, pickle.UnpicklingError):
        # An empty or corrupt file is treated like "no saved cookies" so the
        # caller can fall back to accepting cookies manually.
        return False

    # Compare against the parsed host name rather than the whole URL: a plain
    # substring test would let "yahoo.com" match "https://notyahoo.com/" or a
    # URL that merely mentions the domain in its path or query string.
    host = (urlparse(driver.current_url).hostname or "").lower()
    for cookie in cookies:
        domain = str(cookie.get("domain", "")).lstrip(".").lower()
        if domain and (host == domain or host.endswith("." + domain)):
            driver.add_cookie(cookie)
    return True

In [35]:
# Industry names collected by gather_industry_names(); generate_urls() reads this list
industry_names = []

def gather_industry_names(driver):
    """Collect the industry names listed on the sectors page.

    Navigates to ``sectors_url`` if the browser is not already there, waits for
    the industry rows to be present, then reads the ``td.name`` cell of every
    row after the first two. The result replaces the module-level
    ``industry_names`` list and is also returned.

    Args:
        driver: Selenium WebDriver used for navigation and element lookup.

    Returns:
        list[str]: Non-empty, whitespace-stripped industry names in page order.

    Raises:
        TimeoutException: If no industry row appears within 10 seconds.
        NoSuchElementException: If a data row has no ``td.name`` cell.
    """
    global industry_names
    # NOTE(review): "yf-k3njn8" looks like a generated CSS class and may change
    # whenever the site is redeployed -- confirm the selector before relying on it.
    row_locator = (By.CSS_SELECTOR, "tr.yf-k3njn8")

    # Reload sectors page if not on it
    needs_navigation = driver.current_url != sectors_url
    if needs_navigation:
        print("Returning to sectors page...")
        driver.get(sectors_url)

    # Always wait for the rows, including when the browser was already on the
    # page: the caller may have navigated only moments ago.
    WebDriverWait(driver, 10).until(EC.presence_of_element_located(row_locator))
    if needs_navigation:
        time.sleep(1)  # Short delay for additional page loading

    # Select industry rows after page load
    industry_rows = driver.find_elements(*row_locator)
    # This count includes the two leading rows that are skipped below.
    print(f"Found {len(industry_rows)} industries.")

    # Skip the first two rows (presumably header/summary rows -- confirm) and
    # keep every remaining name, dropping blanks that would produce bogus URLs.
    collected = []
    for row in industry_rows[2:]:
        name = row.find_element(By.CSS_SELECTOR, "td.name").text.strip()
        if name:
            collected.append(name)

    industry_names = collected
    print("Collected industry names:", industry_names)
    return industry_names

In [36]:
def generate_urls(names=None, base=None):
    """Build one industry-page URL per industry name.

    Each name is lower-cased, every '&' is removed, and each run of whitespace
    becomes a single dash (leading/trailing whitespace is dropped), e.g.
    "Other Industrial Metals & Mining" -> "other-industrial-metals-mining".

    Args:
        names: Iterable of industry names. Defaults to the module-level
            ``industry_names`` list.
        base: URL prefix ending in '/'. Defaults to the module-level
            ``sectors_url``.

    Returns:
        list[str]: ``base + slug + '/'`` for every name, in input order.
    """
    if names is None:
        names = industry_names
    if base is None:
        base = sectors_url

    # split()/join collapses whitespace runs of any length; chained
    # replace(' ', '-').replace('--', '-') left "--" behind for longer runs.
    return [
        base + "-".join(name.lower().replace("&", "").split()) + "/"
        for name in names
    ]


In [37]:
# NOTE(review): this cell redefines generate_urls(), which an earlier cell already
# defines; this later definition is the one in effect. Keep only one of the two cells.
def generate_urls(names=None, base=None):
    """Build one industry-page URL per industry name.

    Each name is lower-cased, every '&' is removed, and each run of whitespace
    becomes a single dash (leading/trailing whitespace is dropped), e.g.
    "Other Industrial Metals & Mining" -> "other-industrial-metals-mining".

    Args:
        names: Iterable of industry names. Defaults to the module-level
            ``industry_names`` list.
        base: URL prefix ending in '/'. Defaults to the module-level
            ``sectors_url``.

    Returns:
        list[str]: ``base + slug + '/'`` for every name, in input order.
    """
    if names is None:
        names = industry_names
    if base is None:
        base = sectors_url

    # split()/join collapses whitespace runs of any length; chained
    # replace(' ', '-').replace('--', '-') left "--" behind for longer runs.
    return [
        base + "-".join(name.lower().replace("&", "").split()) + "/"
        for name in names
    ]


In [38]:
# Function to extract "/quote/.../" hrefs from each URL in the list
def extract_quote_links(driver, urls, max_links=10, page_load_delay=5):
    """Visit each URL and collect the quote-page links found on it.

    For every page, anchors whose ``href`` starts with ``/quote/`` and ends with
    ``/`` are resolved against ``base_url``. Duplicates within a page are
    dropped and at most ``max_links`` links are kept per page, in document
    order.

    Args:
        driver: Selenium WebDriver used to load the pages.
        urls: Sequence of page URLs to visit (``len()`` is used for progress).
        max_links: Maximum number of distinct links to keep per page.
        page_load_delay: Seconds to sleep after each navigation so the page can
            finish rendering.

    Returns:
        list[str]: Absolute quote URLs from all pages, concatenated in visit
        order. The same URL can still appear twice if two pages link to it.
    """
    # NOTE(review): ".+" also accepts deeper paths such as /quote/XYZ/news/;
    # tighten the pattern if only ticker landing pages are wanted.
    quote_href = re.compile(r"^/quote/.+/$")
    quote_links = []  # List to hold fully qualified extracted links

    for index, url in enumerate(urls):
        print(f"Accessing URL {index + 1}/{len(urls)}: {url}")
        driver.get(url)
        time.sleep(page_load_delay)  # Fixed delay to allow the page to load

        # Parse the page source with BeautifulSoup
        soup = BeautifulSoup(driver.page_source, 'html.parser')

        matched_links = []
        seen = set()  # The same quote is often linked several times on one page
        for link in soup.find_all('a', href=True):
            # Stop once enough distinct links have been collected
            if len(matched_links) >= max_links:
                break

            href = link['href']
            if not quote_href.match(href):
                continue

            full_url = urljoin(base_url, href)  # Prepend base URL
            if full_url in seen:
                continue
            seen.add(full_url)
            matched_links.append(full_url)

        quote_links.extend(matched_links)

        print(f"Extracted {len(matched_links)} links from {url}")

    return quote_links

In [39]:
# Main function to handle navigation
def main():
    """Run the whole scrape using the module-level ``driver``.

    Steps:
      1. Open ``base_url`` and restore saved cookies, or click the
         "Accept all" consent button and save the resulting cookies.
      2. Open ``sectors_url`` and collect the industry names.
      3. Build one URL per industry and print them.
      4. Visit each industry page and print the quote links found there.

    Cookie/consent handling is best-effort: failures are printed and the run
    continues. Errors from the later steps propagate to the caller.

    Returns:
        None. Results are printed.
    """
    print("Navigating to Yahoo Finance homepage...")
    driver.get(base_url)
    time.sleep(2)

    # Handle cookies if they exist
    if load_cookies(driver, cookie_file_path):
        print("Cookies loaded successfully.")
        driver.refresh()  # Reload so the page is rendered with the restored cookies
    else:
        print("No cookies found. Accepting cookies manually.")
        try:
            accept_button = WebDriverWait(driver, 5).until(
                EC.element_to_be_clickable((By.XPATH, "//button[contains(text(), 'Accept all')]"))
            )
            accept_button.click()
            time.sleep(2)
            save_cookies(driver, cookie_file_path)
            print("Cookies accepted and saved.")
        except TimeoutException:
            # The wait expired: no consent button showed up (e.g. no banner in this region).
            print("No 'Accept all' button appeared within 5 seconds; continuing without it.")
        except Exception as e:
            # Deliberately broad: consent handling must never abort the scrape.
            print("Error handling cookies:", e)

    # Navigate to Basic Materials sector
    print("Navigating to Basic Materials sector...")
    driver.get(sectors_url)
    time.sleep(2)

    # Collect the industry names, then derive one URL per industry
    gather_industry_names(driver)
    updated_urls = generate_urls()

    print(f"List of updated URLs for the {len(updated_urls)} collected industries:")
    for url in updated_urls:
        print(url)

    # Extract quote links from each updated URL
    print("Extracting /quote/.../ links from each industry page...")
    extracted_links = extract_quote_links(driver, updated_urls)

    print("Collected full quote links:")
    for link in extracted_links:
        print(link)


In [40]:
# Run the main function
# main() drives the module-level `driver` and reports its progress via print();
# the browser is left open afterwards and is closed by the final cell.
main()

Navigating to Yahoo Finance homepage...
Cookies loaded successfully.
Navigating to Basic Materials sector...
Found 16 industries.
Collected industry names: ['Specialty Chemicals', 'Gold', 'Building Materials', 'Copper', 'Steel', 'Agricultural Inputs', 'Chemicals', 'Other Industrial Metals & Mining', 'Lumber & Wood Production', 'Aluminum', 'Other Precious Metals & Mining', 'Coking Coal', 'Paper & Paper Products', 'Silver']
List of updated URLs for the first 10 industries:
https://finance.yahoo.com/sectors/basic-materials/specialty-chemicals/
https://finance.yahoo.com/sectors/basic-materials/gold/
https://finance.yahoo.com/sectors/basic-materials/building-materials/
https://finance.yahoo.com/sectors/basic-materials/copper/
https://finance.yahoo.com/sectors/basic-materials/steel/
https://finance.yahoo.com/sectors/basic-materials/agricultural-inputs/
https://finance.yahoo.com/sectors/basic-materials/chemicals/
https://finance.yahoo.com/sectors/basic-materials/other-industrial-metals-mining

In [42]:
# Close the driver
# quit() shuts down the browser session held by the module-level `driver`.
# To scrape again, re-run the cell that creates `driver` before calling main().
print("Script complete. Closing the browser.")
driver.quit()

Script complete. Closing the browser.
