In [1]:
import json
from datetime import datetime, timezone
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

def scrape_science_gateways(output_path="science_gateways_extended.json"):
    """Scrape the Science Gateways resource listing and save it as JSON.

    Fetches the browse listing in a single request (``limit=1000``), then
    follows each entry's detail link through ``scrape_additional_details``
    to collect the extra fields. The merged records are written to
    ``output_path`` as a JSON array.

    Args:
        output_path: Destination JSON file. Defaults to the file name this
            notebook has always written.

    Returns:
        None. Progress and errors are reported with ``print``. If the
        listing page cannot be fetched, nothing is written.
    """
    base_url = "https://sciencegateways.org"
    start_url = f"{base_url}/resources/browse?search=&sortby=date&tag=&type=&limit=1000&limitstart=0"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36"
    }

    try:
        response = requests.get(start_url, headers=headers, timeout=10)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"❌ Error fetching main page: {e}")
        return

    soup = BeautifulSoup(response.text, 'html.parser')
    gateways = []

    for item in soup.select('li.public'):
        name = item.select_one('p.title')
        category = item.select_one('p.details')
        abstract = item.select_one('p.result-description')
        detail_tag = item.select_one('p.title a')

        # Extract text if element exists, else None
        name_text = name.get_text(strip=True) if name else None
        category_text = category.get_text(strip=True) if category else None
        abstract_text = abstract.get_text(strip=True) if abstract else None

        # urljoin resolves root-relative, path-relative and already-absolute
        # hrefs correctly; plain concatenation only worked for hrefs that
        # start with "/".
        if detail_tag and detail_tag.has_attr('href'):
            detail_link = urljoin(f"{base_url}/", detail_tag['href'])
        else:
            detail_link = None

        # Scrape additional details if link exists
        additional_data = scrape_additional_details(detail_link, headers) if detail_link else {}

        # datetime.utcnow() is deprecated; this yields the same naive-UTC
        # ISO string, so the stored format does not change.
        scraped_at = datetime.now(timezone.utc).replace(tzinfo=None).isoformat()

        gateways.append({
            "name": name_text,
            "category": category_text,
            "abstract": abstract_text,
            "site": detail_link,
            **additional_data,
            "date_scraped": scraped_at  # Timestamp for tracking
        })

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(gateways, f, indent=4, ensure_ascii=False)

    print(f"✅ Successfully scraped {len(gateways)} entries and saved to '{output_path}'.")

def scrape_additional_details(url, headers):
    """Fetch one resource's detail page and pull out its extra metadata.

    Args:
        url: Address of the detail page.
        headers: HTTP headers sent with the request.

    Returns:
        A dict with the keys ``published_on``, ``external_site``,
        ``citation`` and ``tags``. A value is ``None`` when the page has no
        matching element. If the request fails, the error is printed and an
        empty dict is returned instead.
    """
    try:
        page = requests.get(url, headers=headers, timeout=10)
        page.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"⚠️ Error fetching details from {url}: {e}")
        return {}

    doc = BeautifulSoup(page.text, 'html.parser')
    details = {
        "published_on": None,
        "external_site": None,
        "citation": None,
        "tags": None,
    }

    # First paragraph of the metadata box, minus its "Published on:" label.
    meta_paragraph = doc.select_one("div.resource-metadata p")
    if meta_paragraph:
        raw_published = meta_paragraph.get_text(strip=True)
        details["published_on"] = raw_published.replace("Published on:", "").strip()

    # Only the first link inside the resource content is looked at, and it
    # is kept only when it is a full http(s) URL.
    first_link = doc.select_one("div.resource-content a[href]")
    if first_link and first_link['href'].startswith("http"):
        details["external_site"] = first_link['href']

    # Citation text comes from the first paragraph of the citations list.
    citation_paragraph = doc.select_one("ul.citations p")
    if citation_paragraph:
        details["citation"] = citation_paragraph.get_text(strip=True)

    # Tags stay None (not an empty list) when no tag links are present.
    tag_links = doc.select(".tags a")
    if tag_links:
        details["tags"] = [link.get_text(strip=True) for link in tag_links]

    return details

# Run the scraper: one request for the listing page plus one request per
# entry's detail page, then write the results to a JSON file.
scrape_science_gateways()


✅ Successfully scraped 636 entries and saved to 'science_gateways_extended.json'.


In [2]:
import json
import os
import spacy
import speech_recognition as sr
from rapidfuzz import process
from openai import OpenAI

# Load NLP model
nlp = spacy.load("en_core_web_sm")

# Load Science Gateways data
# NOTE(review): the scraping cell in this notebook writes
# 'science_gateways_extended.json', not the file read here. Confirm which
# step produces 'science_gateways_optimized.json'.
try:
    with open("science_gateways_optimized.json", "r", encoding="utf-8") as file:
        gateways = json.load(file)
except FileNotFoundError:
    print("❌ Error: The data file 'science_gateways_optimized.json' was not found.")
    # In a Jupyter kernel `exit()` only schedules a shutdown and lets the
    # rest of the cell keep running (with `gateways` undefined). Raising
    # SystemExit stops execution right here, in notebooks and scripts alike.
    raise SystemExit(1) from None
except json.JSONDecodeError:
    print("❌ Error: The data file is corrupted or not in JSON format.")
    raise SystemExit(1) from None

# Initialize OpenAI client
# The API key is read from the OPENAI_API_KEY environment variable so it is
# never hardcoded in the notebook. An unset variable falls back to an empty
# key, so the cell still loads and the problem is reported up front.
if not os.environ.get("OPENAI_API_KEY"):
    print("⚠️ OPENAI_API_KEY is not set; requests to OpenAI are expected to fail until it is.")
client = OpenAI(
  api_key=os.environ.get("OPENAI_API_KEY", "")
)

# Function to search Science Gateway dataset
def search_gateways(query, limit=5, min_score=60):
    """Fuzzy-search the loaded gateways by name, category and abstract.

    Args:
        query: Free-text search string. Matching is case-insensitive.
        limit: Maximum number of candidates taken from each field.
        min_score: A match is kept only if its score is strictly greater
            than this value.

    Returns:
        A list of gateway dicts taken from the module-level ``gateways``
        list: name matches first, then category matches, then abstract
        matches. Each gateway appears at most once. The list is empty when
        ``query`` is empty or nothing scores above ``min_score``.
    """
    if not query:
        return []

    query = query.lower().strip()

    results = []
    seen_indices = set()

    for field in ("name", "category", "abstract"):
        # Keep each candidate's position in `gateways` next to its text.
        # Entries without this field are skipped, so positions in the
        # candidate list differ from positions in `gateways`.
        candidates = [(g[field], i) for i, g in enumerate(gateways) if g.get(field)]

        # The query is lowercased above, so lowercase the choices as well;
        # otherwise case differences alone lower the score.
        choices = [text.lower() for text, _ in candidates]

        for _match_text, score, choice_index in process.extract(query, choices, limit=limit):
            # `choice_index` points into `choices`; translate it back to
            # the gateway's real position before looking the record up.
            gateway_index = candidates[choice_index][1]
            if score > min_score and gateway_index not in seen_indices:
                seen_indices.add(gateway_index)
                results.append(gateways[gateway_index])

    return results

# Function to interact with GPT-3.5 Turbo (enhanced with Science Gateway data)
def gpt_response(user_message, conversation_history):
    """Answer a user message with GPT-3.5 Turbo, grounded in gateway data.

    Looks up matching gateways with ``search_gateways`` and, when any are
    found, embeds the top three in the prompt. The request also carries the
    prior turns from ``conversation_history``.

    Args:
        user_message: The user's question as plain text.
        conversation_history: List of ``{"role", "content"}`` message dicts
            from earlier turns. It is mutated in place: after a successful
            reply, the user message and the assistant reply are appended.

    Returns:
        The assistant's reply text, or a "Sorry, an error occurred: ..."
        string if the API call raised. Failed turns are not added to
        ``conversation_history``, so error text is never sent back to the
        model as if it were a previous answer.
    """
    # Step 1: Try to find a relevant match in the Science Gateway dataset
    matches = search_gateways(user_message)

    if matches:
        # Include top 3 relevant results
        relevant_info = "\n".join(_format_gateway_summary(g) for g in matches[:3])
        system_instruction = "Use the following science gateway information to respond to the user’s question."
        prompt = f"{system_instruction}\n\n{relevant_info}\n\nUser query: {user_message}"
    else:
        prompt = f"User query: {user_message}"
        print("🤖 No matching Science Gateways found. I'll provide a general response.")

    # Step 2: Format conversation history for context-aware responses
    messages = [
        {"role": "system", "content": "You are a knowledgeable AI assistant specialized in science research tools and resources."}
    ]
    messages.extend(conversation_history)  # Include past messages
    messages.append({"role": "user", "content": prompt})  # Add current user query

    # Step 3: Query OpenAI GPT-3.5 Turbo
    try:
        response = client.chat.completions.create(
            model="gpt-3.5-turbo",
            messages=messages,
            max_tokens=250
        )
        bot_response = response.choices[0].message.content.strip()
    except Exception as e:
        # Report the failure to the caller but leave the history untouched.
        return f"Sorry, an error occurred: {str(e)}"

    # Step 4: Store the conversation context (successful turns only)
    conversation_history.append({"role": "user", "content": user_message})
    conversation_history.append({"role": "assistant", "content": bot_response})

    return bot_response


def _format_gateway_summary(gateway):
    """Render one gateway record as the text block embedded in the prompt."""
    # A field may be absent or stored as null in the JSON. `dict.get`'s
    # default only covers the absent case, so `or` supplies the fallback for
    # both (slicing a None abstract would otherwise raise TypeError).
    name = gateway.get('name') or 'N/A'
    category = gateway.get('category') or 'N/A'
    published_on = gateway.get('published_on') or 'Unknown'
    abstract = gateway.get('abstract') or 'N/A'
    site = gateway.get('site') or 'N/A'
    return (
        f"🔹 **{name}**\n🏷 **Category**: {category}\n📅 **Published On**: {published_on}\n"
        f"📝 **Abstract**: {abstract[:300]}...\n🌍 **Website**: {site}\n"
    )

# Function to get voice input with retries
def get_voice_input(max_retries=3):
    """Capture a spoken query from the microphone and return it as text.

    Each attempt opens the microphone, calibrates for ambient noise, waits
    up to 8 seconds for speech to start and sends the audio to Google's
    recognizer.

    Args:
        max_retries: Number of listening attempts before giving up.

    Returns:
        The recognized text in lowercase, or ``None`` if every attempt
        failed (silence, unintelligible audio or a service error).
    """
    recognizer = sr.Recognizer()
    for _ in range(max_retries):
        with sr.Microphone() as mic:
            print("\n🎙 Speak now...")
            recognizer.adjust_for_ambient_noise(mic)
            try:
                audio = recognizer.listen(mic, timeout=8)
                text = recognizer.recognize_google(audio)
                print(f"🗣 You said: {text}\n")
                return text.lower()
            except sr.WaitTimeoutError:
                # listen() raises this when no speech starts within the
                # timeout; treat it as a failed attempt and retry.
                print("❌ No speech detected. Please try again.")
            except sr.UnknownValueError:
                print("❌ Sorry, I couldn't understand. Please try again.")
            except sr.RequestError:
                print("⚠️ Error connecting to the speech service. Try again later.")
    print("❌ Max retries reached. Switching to text input.")
    return None

# Start chatbot
# Transcript of the session so far; it is handed to gpt_response on every
# turn so replies can take earlier messages into account.
conversation_history = []

print("🤖 Welcome to the Science Gateway AI Chatbot! Type 'exit' to quit.")
while True:
    user_input = input("\n📝 Type your message OR 🎙 Say 'voice' to speak: ").strip()

    # 'exit' (any casing) ends the session.
    if user_input.lower() == "exit":
        print("👋 Goodbye! See you next time!")
        break

    # 'voice' hands over to the microphone; anything else is the query itself.
    user_query = get_voice_input() if user_input.lower() == "voice" else user_input

    # Nothing to send (blank line or failed voice capture): prompt again.
    if user_query:
        # Get AI response (uses Science Gateway data when relevant)
        bot_response = gpt_response(user_query, conversation_history)
        print(f"\n🤖 **Science Gateway AI**: {bot_response}\n")

🤖 Welcome to the Science Gateway AI Chatbot! Type 'exit' to quit.

📝 Type your message OR 🎙 Say 'voice' to speak: What are some cool science gateways?

🤖 **Science Gateway AI**: Here are some cool science gateways that you may find interesting:

1. **dREG gateway**
   - **Category**: Science Gateways
   - **Abstract**: The dREG gateway enables users to identify the location of promoters and enhancers using PRO-seq, GRO-seq, or ChRO-seq data, focusing on transcriptional regulatory elements (TREs) in genomes.
   - **Website**: [dREG gateway](https://sciencegateways.org/resources/9610)

2. **GeoGateway**
   - **Category**: Science Gateways
   - **Abstract**: GeoGateway is a data product search and analysis gateway that facilitates scientific discovery, field use, and disaster response by integrating NASA geodetic imaging products with earthquake-related datasets and models.
   - **Website**: [GeoGateway](https://sciencegateways.org/resources/9717)

3. **COSMIC2 Science Gateway**
   - **Ca