In [1]:
%pip install selenium webdriver-manager pandas


Note: you may need to restart the kernel to use updated packages.


In [2]:
# Import necessary libraries
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time
import re
from tqdm import tqdm

In [3]:
# Read the CSV file containing grant URLs into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path — confirm it exists
# before re-running the notebook on another machine.
input_csv_path = '/Users/miguelemb/Downloads/Projects/Grant scrapper/data/seed_url/Business Incubator Grants (1).csv'
grants_df = pd.read_csv(filepath_or_buffer=input_csv_path)

In [4]:
# Function to clean and extract URLs from the 'OPPORTUNITY NUMBER' column
def clean_url(cell):
    """Return the first double-quoted ``http...`` URL embedded in ``cell``.

    Args:
        cell: Raw value of one 'OPPORTUNITY NUMBER' cell. Expected to be a
            string; anything else (for example the float NaN pandas uses for
            an empty CSV cell, or None) is treated as "no URL".

    Returns:
        The URL without its surrounding quotes, or None when ``cell`` is not
        a string or contains no quoted URL.
    """
    # re.search raises TypeError on non-string input, so filter those out
    # first instead of letting one empty cell abort the whole .apply().
    if not isinstance(cell, str):
        return None
    match = re.search(r'\"(http[^"]+)\"', cell)
    return match.group(1) if match else None

# Add a 'URL' column holding whatever clean_url extracts from each
# 'OPPORTUNITY NUMBER' cell.
opportunity_cells = grants_df['OPPORTUNITY NUMBER']
grants_df['URL'] = opportunity_cells.apply(clean_url)

In [5]:
# Discard every row whose 'URL' value is missing; the defaults of dropna
# (row-wise, drop on any missing value) are spelled out explicitly.
required_columns = ['URL']
grants_df = grants_df.dropna(axis='index', how='any', subset=required_columns)

In [6]:
# Build the Chrome WebDriver shared by the rest of the notebook.
chrome_options = webdriver.ChromeOptions()
# Command-line switches handed to Chrome as-is.
for chrome_flag in ('--no-sandbox', '--disable-dev-shm-usage'):
    chrome_options.add_argument(chrome_flag)
# webdriver-manager's install() result is passed to Service as the driver location.
chrome_service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=chrome_service, options=chrome_options)


In [7]:
# Function to extract all visible text from a webpage
def extract_all_text(url, wait_seconds=1, browser=None):
    """Load ``url`` in a Selenium browser and return the text of its ``<body>``.

    Args:
        url: Address of the page to open.
        wait_seconds: Seconds to pause after navigation before the page
            source is read. Defaults to 1, the delay previously hard-coded.
        browser: WebDriver instance to use. When None (the default) the
            notebook-level ``driver`` is used, as before.

    Returns:
        The body text with fragments stripped and joined by single spaces.
        Returns "" when the page has no ``<body>`` or when any step raises;
        in the latter case the error is printed rather than propagated, so
        one bad page does not abort a long scraping run.
    """
    try:
        # Resolved inside the try so a missing global driver is reported the
        # same way as any other failure.
        active_browser = driver if browser is None else browser
        active_browser.get(url)
        time.sleep(wait_seconds)  # Allow time for the page to load
        soup = BeautifulSoup(active_browser.page_source, 'html.parser')

        # Extract all text from the body of the page
        body = soup.find('body')
        if not body:
            return ""
        return body.get_text(separator=' ', strip=True)
    except Exception as e:
        print(f"Error extracting text from {url}: {e}")
        return ""


# Loop through each grant URL and extract data.
# Columns copied unchanged from the input frame into every output record
# (None is stored when a column is absent).
RETAINED_COLUMNS = (
    'OPPORTUNITY NUMBER',
    'OPPORTUNITY TITLE',
    'AGENCY CODE',
    'OPPORTUNITY STATUS',
    'POSTED DATE',
    'CLOSE DATE',
)

start_time = time.time()
grants_data = []
# iterrows() yields (index, row) pairs; the index is not needed here.
for _, grant in tqdm(grants_df.iterrows(), total=len(grants_df), desc="Processing URLs"):
    url = grant['URL']

    # Store the extracted data, retaining original columns
    record = {column: grant.get(column, None) for column in RETAINED_COLUMNS}
    record['URL'] = url
    # Extract all available text from the grant page ("" if extraction failed)
    record['Extracted Text'] = extract_all_text(url)
    grants_data.append(record)


Processing URLs: 100%|██████████| 510/510 [10:28<00:00,  1.23s/it]


In [8]:
# Convert the scraped data into a DataFrame
scraped_grants_df = pd.DataFrame(grants_data)
# Save the extracted data to a new CSV file
output_csv_path = '/Users/miguelemb/Downloads/Projects/Grant scrapper/data/scraped_data/scraped_all_grants_2.csv'
try:
    scraped_grants_df.to_csv(output_csv_path, index=False)
finally:
    # Close the Selenium WebDriver even if writing the CSV fails, so the
    # browser process is not left running. scraped_grants_df stays in memory,
    # so the save can be retried after fixing the path.
    driver.quit()

# Print the total time taken (measured from start_time set before the scrape loop)
end_time = time.time()
print(f"Total time taken: {end_time - start_time:.2f} seconds")

Total time taken: 628.58 seconds
