# Obtain article titles and metadata for a given query (e.g., race)

In [3]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import pandas as pd

# Function to start the driver
def start_driver(page, driver_path='/usr/local/bin/chromedriver', load_wait=2):
    """Launch a Chrome WebDriver session and navigate it to ``page``.

    Args:
        page: URL to open once the browser has started.
        driver_path: Filesystem path of the chromedriver executable handed
            to the Selenium ``Service``.
        load_wait: Seconds to sleep after navigation so the page can
            finish loading before the caller starts querying it.

    Returns:
        The started WebDriver instance. The caller owns it and is
        responsible for calling ``quit()`` on it.

    Raises:
        Whatever ``driver.get`` raises if navigation fails; the browser is
        shut down before the exception propagates.
    """
    print("Starting the WebDriver.")
    service = Service(driver_path)
    driver = webdriver.Chrome(service=service)
    try:
        driver.get(page)
    except Exception:
        # Navigation failed, so the caller never receives the driver and
        # cannot close it: quit here to avoid leaving a browser running.
        driver.quit()
        raise
    time.sleep(load_wait)  # Wait for the page to load
    print("WebDriver started and page loaded.")
    return driver

def extract_titles(driver):
    """Return the title text of every search hit currently on the page.

    Each hit is an ``<li class="ais-InfiniteHits-item">`` element and its
    title is read from the ``span.sr-only`` element inside it. A hit whose
    title cannot be read is reported on stdout and skipped, so one bad
    entry does not abort the whole extraction.

    Args:
        driver: WebDriver positioned on a search-results page.

    Returns:
        A list of whitespace-stripped title strings, in page order.
    """
    print("Extracting titles from the page.")
    hits = driver.find_elements(By.CSS_SELECTOR, 'li.ais-InfiniteHits-item')

    collected = []
    for hit in hits:
        try:
            heading = hit.find_element(By.CSS_SELECTOR, 'span.sr-only').text.strip()
        except Exception as err:
            # Best effort: log the failure and move on to the next hit.
            print("Error extracting title:", err)
            continue
        collected.append(heading)

    print(f"Found {len(collected)} titles.")
    return collected


# Function to scrape all titles from the page
def scrape_all_titles(query):
    """Scrape all Brookings search-result titles for ``query``.

    Opens the site search for the query, keeps clicking the 'Show More'
    button until it can no longer be found or clicked, extracts the titles
    of all loaded results, prints them as a DataFrame and returns it.

    Args:
        query: Search term. It is URL-encoded before being placed in the
            query string, so spaces and reserved characters are safe.

    Returns:
        A pandas DataFrame with a single "Search Results" column, one
        title per row.
    """
    from urllib.parse import quote_plus

    from selenium.common.exceptions import WebDriverException

    page = f"https://www.brookings.edu/?s={quote_plus(str(query))}"
    driver = start_driver(page)

    try:
        # Step 1: Reach the bottom of the page by loading more results until no more
        while True:
            try:
                print("Looking for 'Show More' button.")
                show_more_button = driver.find_element(By.CSS_SELECTOR, 'button.ais-InfiniteHits-loadMore')

                # Wait until the button is clickable
                print("Waiting for 'Show More' button to be clickable.")
                WebDriverWait(driver, 5).until(EC.element_to_be_clickable(show_more_button))

                print("Clicking 'Show More' button.")
                show_more_button.click()
                time.sleep(2)  # Wait for new results to load
                print("Clicked 'Show More' and waiting for more results.")
            except WebDriverException as e:
                # A missing or unclickable button is how the end of the
                # result list shows up; only Selenium errors are treated
                # that way so unrelated bugs still surface.
                print("No more results or error occurred:", e)
                break

        # Step 2: Now that all results are loaded, extract titles
        print("Extracting titles from all pages.")
        titles = extract_titles(driver)
    finally:
        # Always close the browser, even if scraping failed part-way.
        driver.quit()

    # Step 3: Convert the list of titles to a Pandas DataFrame
    df = pd.DataFrame(titles, columns=["Search Results"])

    # Display the DataFrame
    print(f"Displaying {len(df)} search results.")
    print(df)  # Display the DataFrame in the console

    return df

# Define the query and call the function.
# Running this cell launches a Chrome session (via start_driver) and prints
# the scraped titles to the cell output.
query = "lamppost"  # Example query
scrape_all_titles(query)


Starting the WebDriver.
WebDriver started and page loaded.
Looking for 'Show More' button.
Waiting for 'Show More' button to be clickable.
Clicking 'Show More' button.
Clicked 'Show More' and waiting for more results.
Looking for 'Show More' button.
Waiting for 'Show More' button to be clickable.
No more results or error occurred: Message: 

Extracting titles from all pages.
Extracting titles from the page.
Found 13 titles.
Displaying 13 search results.
                                       Search Results
0   The lamppost theory: Why economic policy so of...
1   Restoring non-discrimination to the 21st centu...
2   10 economic characteristics of refugee arrival...
3   Rigorous preschool research illuminates policy...
4                            (re)Searching for impact
5   Is the U.S. military's futurism obsession hurt...
6   The Missing “One-Offs”: The Hidden Supply of H...
7                         Somebody Turn on the Lights
8         Libya's Muslim Brotherhood Faces the Future
9 

In [None]:
# NOTE(review): nothing in the visible cells imports ace_tools — confirm this
# install is actually needed before keeping it.
!pip install ace_tools