In [None]:
import requests
from bs4 import BeautifulSoup
import json

def _text_or_na(parent, selector):
    """Return the stripped text of the first *selector* match under *parent*, or "N/A"."""
    node = parent.select_one(selector)
    return node.get_text(strip=True) if node is not None else "N/A"


def scrape_science_gateways():
    """Scrape the sciencegateways.org resource listing into a JSON file.

    Fetches the browse page (requesting up to 1000 entries via the ``limit``
    query parameter), extracts name, category and abstract from every
    ``li.public`` element, follows each entry's detail link through
    ``scrape_additional_details`` and merges the result into the record.
    All records are written to ``science_gateways_extended.json`` in the
    current working directory and a summary line is printed.

    Fields that cannot be found in the markup are stored as ``"N/A"``.

    Returns:
        None. If the listing page cannot be fetched, an error is printed and
        the function returns without writing the output file.
    """
    # Imported locally so the block does not depend on edits to the
    # file-level import section.
    from urllib.parse import urljoin

    base_url = "https://sciencegateways.org"
    start_url = f"{base_url}/resources/browse?search=&sortby=date&tag=&type=&limit=1000&limitstart=0"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36"
    }

    try:
        response = requests.get(start_url, headers=headers, timeout=10)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"❌ Error fetching main page: {e}")
        return

    soup = BeautifulSoup(response.text, 'html.parser')
    gateways = []

    for item in soup.select('li.public'):
        name = _text_or_na(item, 'p.title')
        category = _text_or_na(item, 'p.details')
        abstract = _text_or_na(item, 'p.result-description')

        detail_tag = item.select_one('p.title a')
        if detail_tag is not None and detail_tag.has_attr('href'):
            # urljoin resolves root-relative hrefs exactly like plain
            # concatenation did, but also copes with hrefs that are already
            # absolute or that lack a leading slash.
            detail_link = urljoin(base_url, detail_tag['href'])
            additional_data = scrape_additional_details(detail_link, headers)
        else:
            detail_link = "N/A"
            additional_data = {}

        gateways.append({
            "name": name,
            "category": category,
            "site": detail_link,
            "abstract": abstract,
            # Adds published_on / site_url / cite / tags when the detail page
            # was fetched; an empty dict (fetch failed or no link) adds nothing.
            **additional_data,
        })

    with open("science_gateways_extended.json", "w", encoding="utf-8") as f:
        json.dump(gateways, f, indent=4, ensure_ascii=False)

    print(f"✅ Successfully scraped {len(gateways)} entries and saved to science_gateways_extended.json")

def scrape_additional_details(url, headers):
    """Fetch one resource detail page and pull out its extra metadata.

    Args:
        url: Address of the detail page to download.
        headers: HTTP headers passed through to ``requests.get``.

    Returns:
        A dict with the keys ``published_on``, ``site_url``, ``cite`` and
        ``tags``. Text fields that are not found on the page are ``"N/A"``
        and ``tags`` is an empty list when no tag links exist. If the request
        fails, a warning is printed and an empty dict is returned instead.
    """
    try:
        page = requests.get(url, headers=headers, timeout=10)
        page.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"⚠️ Error fetching details from {url}: {e}")
        return {}

    document = BeautifulSoup(page.text, 'html.parser')

    def _is_published_paragraph(node):
        # Matches a <p> element whose text contains the publication label.
        return node.name == "p" and "Published on:" in node.text

    # Start from the fallback values and overwrite whatever the page provides.
    details = {
        "published_on": "N/A",
        "site_url": "N/A",
        "cite": "N/A",
        "tags": [],
    }

    published_node = document.find(_is_published_paragraph)
    if published_node:
        label_and_date = published_node.get_text(strip=True)
        details["published_on"] = label_and_date.replace("Published on:", "").strip()

    # First link inside the resource content area is taken as the external site.
    link_node = document.select_one("div.resource-content a[href]")
    if link_node:
        details["site_url"] = link_node['href']

    citation_node = document.select_one("ul.citations p")
    if citation_node:
        details["cite"] = citation_node.text.strip()

    # An empty selection simply leaves the list empty.
    for anchor in document.select(".tags a"):
        details["tags"].append(anchor.text.strip())

    return details

# Run the scraper as soon as this cell/script executes: it performs the HTTP
# requests and writes science_gateways_extended.json to the working directory.
scrape_science_gateways()