In [1]:
import time

In [2]:
import requests
from bs4 import BeautifulSoup

# Crawl the paginated project listing and collect the href of every project card.
base_url = "https://serdp-estcp.org/projects/listing"
page_limit = 12  # Number of projects per page
total_projects = 5000  # Upper bound on the number of projects to fetch
request_timeout = 30  # Seconds to wait on a stalled request before giving up

project_links = []

# Iterate through the pages
for page_offset in range(0, total_projects, page_limit):
    size_before = len(project_links)  # Size of the list before this page is processed

    # Build the query as a list of (key, value) pairs rather than a dict:
    # "DropdownSelectedValuesDictionary" and "Sort" are repeated, and a dict
    # literal silently keeps only the LAST value of a repeated key, so the
    # earlier filters / sort key would never reach the server.
    params = [
        ("ResourceType", "node--project"),
        ("DropdownSelectedValuesDictionary", "[FiscalYear, -1]"),
        ("DropdownSelectedValuesDictionary", "[Program, -1]"),
        ("DropdownSelectedValuesDictionary", "[FocusAreaFilterTerm, -1]"),
        ("AdditionalParameters", "[FocusAreaFilterTitle, All Focus Areas]"),
        ("FilterGroups", "Sepub.Infrastructure.Drupal.Models.ApiFilterGroup"),
        ("Sort", "-field_publish_date"),
        ("Sort", "title"),
        ("PageLimit", page_limit),
        ("PageOffset", page_offset),
    ]

    try:
        response = requests.get(base_url, params=params, timeout=request_timeout)
    except requests.RequestException as exc:
        # Keep whatever was collected so far instead of losing it to a traceback.
        print("Request failed:", exc)
        break

    # Check if the request was successful
    if response.status_code != 200:
        print("Request failed with status code:", response.status_code)
        break  # Stop iterating if the request fails

    # Parse the HTML content
    soup = BeautifulSoup(response.content, "html.parser")

    # Find all <div> tags with the specified class attribute
    project_divs = soup.find_all('div', class_='col-md-3 mb-1 card-project')

    # Extract the href of the first <a> inside each project card
    for div in project_divs:
        a_tag = div.find('a')
        if a_tag is None:
            continue
        href = a_tag.get('href')
        if href:  # skip anchors without an href so no None ends up in the list
            project_links.append(href)

    size_after = len(project_links)  # Size of the list after this page

    # A page that contributes nothing means we ran past the last page
    if size_after == size_before:
        print("No new links found. Stopping the loop.")
        break

    print('waiting 2 sec, size of project_links:', len(project_links))
    # NOTE: the throttle is currently disabled, so despite the message above
    # no pause happens between requests. Uncomment to re-enable it.
    #time.sleep(2)

# Print the extracted href values
for link in project_links:
    print(link)

waiting 2 sec, size of project_links: 12
waiting 2 sec, size of project_links: 24
waiting 2 sec, size of project_links: 36
waiting 2 sec, size of project_links: 48
waiting 2 sec, size of project_links: 60
waiting 2 sec, size of project_links: 72
waiting 2 sec, size of project_links: 84
waiting 2 sec, size of project_links: 96
waiting 2 sec, size of project_links: 108
waiting 2 sec, size of project_links: 120
waiting 2 sec, size of project_links: 132
waiting 2 sec, size of project_links: 144
waiting 2 sec, size of project_links: 156
waiting 2 sec, size of project_links: 168
waiting 2 sec, size of project_links: 180
waiting 2 sec, size of project_links: 192
waiting 2 sec, size of project_links: 204
waiting 2 sec, size of project_links: 216
waiting 2 sec, size of project_links: 228
waiting 2 sec, size of project_links: 240
waiting 2 sec, size of project_links: 252
waiting 2 sec, size of project_links: 264
waiting 2 sec, size of project_links: 276
waiting 2 sec, size of project_links: 288


In [3]:
def extract_metadata(url, timeout=30):
    """Fetch one project detail page and return its metadata as a dict.

    Args:
        url: Site-relative path of the project page (it is appended to
            'https://serdp-estcp.org').
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A dict with the keys 'Title', 'Canonical Link', 'Description',
        'Keywords', 'Custom Meta', 'Content Type', 'Published Time' and
        'Modified Time'. 'Title' falls back to "Title Not Found"; any other
        field whose tag is absent from the page is None.
        Returns None when the request fails or the status code is not 200.
    """
    print('waiting 0.5 sec before url:', url)
    print('')
    # NOTE: the throttle is currently disabled, so no pause actually happens.
    #time.sleep(0.5)

    # Send a GET request to the webpage
    try:
        response = requests.get('https://serdp-estcp.org' + url, timeout=timeout)
    except requests.RequestException as exc:
        # Same contract as a bad status code: report and return None so a
        # single unreachable page does not abort the caller's loop.
        print('Failed to retrieve the webpage. Error:', exc)
        return None

    # Check if the request was successful
    if response.status_code != 200:
        print('Failed to retrieve the webpage. Status code:', response.status_code)
        return None

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, 'html.parser')

    def attr_or_none(tag_name, match, attr):
        """Return `attr` of the first matching tag, or None if the tag is missing."""
        tag = soup.find(tag_name, match)
        return tag.get(attr) if tag is not None else None

    # Extract the title or provide a default value if it doesn't exist
    title_element = soup.find('h2', {'class': 'text-white'})
    title = title_element.get_text(strip=True) if title_element else "Title Not Found"

    # Every lookup goes through attr_or_none: calling .get() directly on the
    # result of soup.find() raises AttributeError when a page lacks the tag.
    return {
        'Title': title,
        'Canonical Link': attr_or_none('link', {'rel': 'canonical'}, 'href'),
        'Description': attr_or_none('meta', {'name': 'description'}, 'content'),
        'Keywords': attr_or_none('meta', {'name': 'keywords'}, 'content'),
        'Custom Meta': attr_or_none('meta', {'name': 'searchgov_custom1'}, 'content'),
        'Content Type': attr_or_none('meta', {'name': 'dc.type'}, 'content'),
        'Published Time': attr_or_none('meta', {'property': 'article:published_time'}, 'content'),
        'Modified Time': attr_or_none('meta', {'property': 'article:modified_time'}, 'content'),
    }

In [4]:
# Smoke test: run the extractor on the first collected link and display the resulting dict.
extract_metadata(project_links[0])

waiting 0.5 sec before url: /projects/details/24c3d03c-ae90-4ab8-b5ea-645bec94e235/weather-effects-on-the-lifecycle-of-dod-equipment-replacement-welder-a-plug-in-for-the-builder-sustainment-management-system



{'Title': 'Weather Effects on the Lifecycle of DoD Equipment Replacement (WELDER): A Plug-in for the BUILDER Sustainment Management System',
 'Canonical Link': 'https://serdp-estcp.mil/projects/details/24c3d03c-ae90-4ab8-b5ea-645bec94e235',
 'Description': 'Given increasing threats of extreme weather events, facility planners and policymakers need state-of-the-art information that projects long-term environmental risk and informs how these events may alter the replacement schedules and the performance profiles of individual facilities and their constituent systems and components. The BUILDER Sustainment Management System—the lifecycle management tool used by the U.S. Department of Defense (DoD) to consistently and comprehensively assess and forecast facility conditions—does not currently consider vulnerability to extreme weather events. This project will develop an application programming interface (API) plug-in for BUILDER that allows users to visualize weather event projections and r

In [5]:
# Number of project links currently held in project_links.
len(project_links)

2660

In [6]:
# Visit every collected project link and keep the metadata of each page
# that could be scraped; pages that yield nothing are only reported.
metadata_list = []
for index, link in enumerate(project_links):
    metadata = extract_metadata(link)
    if not metadata:
        print(f"NO METADATA FOUND for link at index {index}")
        continue
    metadata_list.append(metadata)
    print(f"Metadata extracted for link at index {index}")

waiting 0.5 sec before url: /projects/details/24c3d03c-ae90-4ab8-b5ea-645bec94e235/weather-effects-on-the-lifecycle-of-dod-equipment-replacement-welder-a-plug-in-for-the-builder-sustainment-management-system

Metadata extracted for link at index 0
waiting 0.5 sec before url: /projects/details/3d41c4a2-bdb0-4402-beb4-f3dcf89f250f/cr20-5050-project-overview

Metadata extracted for link at index 1
waiting 0.5 sec before url: /projects/details/868266af-2001-4750-a285-db4b0cf84991/cr20-5175-project-overview

Metadata extracted for link at index 2
waiting 0.5 sec before url: /projects/details/da39a17e-a0a1-482a-84ce-0c6ea9794374/cr20-5303-project-overview

Metadata extracted for link at index 3
waiting 0.5 sec before url: /projects/details/f3f398cb-c91a-4199-b5e5-f508e321adb8/cr21-5028-project-overview

Metadata extracted for link at index 4
waiting 0.5 sec before url: /projects/details/cdfae63a-4665-4921-b5cb-28b87254bf8d/fit-for-purpose-an-integrative-assessment-of-state-of-the-science-dow

In [7]:
# Keep an independent backup: a plain assignment would only alias the same
# list object, so appending to or clearing metadata_list would change the
# "backup" as well. Each record dict is copied too.
metadata_list_bck = [dict(record) for record in metadata_list]

In [8]:
import csv

# Specify the file name
file_name = 'metadata_v2.csv'

if not metadata_list:
    # Check BEFORE opening the file: opening in 'w' mode truncates it, and
    # metadata_list[0] would then raise IndexError, leaving an empty file
    # in place of any earlier export.
    print(f'No metadata to write; {file_name} was left untouched')
else:
    # Assuming all dictionaries have the same keys, use the keys of the first dictionary as field names
    fieldnames = list(metadata_list[0].keys())

    # Write data to a CSV file (newline='' as the csv module requires)
    with open(file_name, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(metadata_list)

    print(f'Data successfully written to {file_name}')


Data successfully written to metadata_v2.csv


In [10]:
# Display the current contents of metadata_list as the cell output.
metadata_list

[]

In [11]:
#project_links[0]
#response = requests.get('https://serdp-estcp.org/projects/details/24c3d03c-ae90-4ab8-b5ea-645bec94e235')
#soup = BeautifulSoup(response.content, 'html.parser')