In [1]:
!pip install serpapi beautifulsoup4 requests newspaper3k

In [8]:
import requests
from bs4 import BeautifulSoup
import re

# -----------------------------
# CONFIGURATION
# -----------------------------
SERPAPI_KEY = "79d5a10900dc6aae037bdbb5a52937fb569ad2e4"  # replace with your key
SEARCH_ENGINE = "google"

# -----------------------------
# 1. Web Search Function (SerpAPI)
# -----------------------------
def web_search(query, num_results=5):
    """Search web using SerpAPI and return top result URLs."""
    url = "https://serpapi.com/search"
    params = {
        "engine": SEARCH_ENGINE,
        "q": query,
        "api_key": SERPAPI_KEY,
        "num": num_results,
        "hl": "en",
    }
    res = requests.get(url, params=params)
    data = res.json()

    results = []
    for item in data.get("organic_results", []):
        results.append({
            "title": item.get("title"),
            "link": item.get("link"),
            "snippet": item.get("snippet")
        })
    return results


# -----------------------------
# 2. Clean Text Extraction from URL
# -----------------------------
def extract_text_from_url(url, min_length=200):
    """Extracts main content text from a web page using BeautifulSoup."""
    try:
        html = requests.get(url, timeout=10).text
        soup = BeautifulSoup(html, "html.parser")

        # Remove scripts, styles, headers, footers, navs
        for tag in soup(["script", "style", "noscript", "header", "footer", "form", "nav", "aside"]):
            tag.extract()

        text = soup.get_text(separator=" ")
        # clean multiple spaces
        text = re.sub(r"\s+", " ", text).strip()

        if len(text) >= min_length:
            return text
        else:
            return ""

    except Exception as e:
        print(f"⚠️ Error parsing {url}: {e}")
        return ""


# -----------------------------
# 3. Complete Web Retriever
# -----------------------------
def web_retriever(query, num_results=3):
    """Search the web and return cleaned content of top results."""
    search_results = web_search(query, num_results)
    pages = []

    for result in search_results:
        link = result["link"]
        text = extract_text_from_url(link)
        if text:
            pages.append({
                "title": result["title"],
                "url": link,
                "snippet": result.get("snippet", ""),
                "content": text[:5000]  # trim to avoid huge texts
            })
    return pages


# -----------------------------
# 4. Example Usage
# -----------------------------
# if __name__ == "__main__":
query = "Apple 2024 annual report summary"
results = web_retriever(query, num_results=3)

for i, r in enumerate(results):
    print(f"--- Result {i+1} ---")
    print(f"Title: {r['title']}")
    print(f"URL: {r['url']}")
    print(f"Snippet: {r['snippet']}")
    print(f"Content preview:\n{r['content'][:500]}...\n")

In [9]:
import requests
from bs4 import BeautifulSoup
import re

SERPAPI_KEY = "79d5a10900dc6aae037bdbb5a52937fb569ad2e4"  # replace with your key
SEARCH_ENGINE = "google"

def web_search(query, num_results=5):
    """Search web using SerpAPI and return top result URLs."""
    url = "https://serpapi.com/search"
    params = {
        "engine": SEARCH_ENGINE,
        "q": query,
        "api_key": SERPAPI_KEY,
        "num": num_results,
        "hl": "en",
    }

    res = requests.get(url, params=params)
    try:
        data = res.json()
    except Exception as e:
        print("⚠️ Could not parse JSON:", e)
        print(res.text)
        return []

    # Debug: print keys to see what SerpAPI returned
    print("Keys in JSON response:", data.keys())

    results = []
    if "organic_results" not in data:
        print("⚠️ No organic_results found in response")
        return []

    for item in data["organic_results"]:
        results.append({
            "title": item.get("title"),
            "link": item.get("link"),
            "snippet": item.get("snippet")
        })
    return results

def extract_text_from_url(url, min_length=200):
    try:
        html = requests.get(url, timeout=10).text
        soup = BeautifulSoup(html, "html.parser")
        for tag in soup(["script", "style", "noscript", "header", "footer", "form", "nav", "aside"]):
            tag.extract()
        text = soup.get_text(separator=" ")
        text = re.sub(r"\s+", " ", text).strip()
        if len(text) >= min_length:
            return text
        else:
            return ""
    except Exception as e:
        print(f"⚠️ Error parsing {url}: {e}")
        return ""

def web_retriever(query, num_results=3):
    search_results = web_search(query, num_results)
    if not search_results:
        print("⚠️ No search results returned")
        return []

    pages = []
    for result in search_results:
        link = result["link"]
        text = extract_text_from_url(link)
        if text:
            pages.append({
                "title": result["title"],
                "url": link,
                "snippet": result.get("snippet", ""),
                "content": text[:5000]
            })
    return pages

if __name__ == "__main__":
    query = "Apple 2024 annual report summary"
    results = web_retriever(query, num_results=3)

    if not results:
        print("⚠️ No pages retrieved")
    else:
        for i, r in enumerate(results):
            print(f"--- Result {i+1} ---")
            print(f"Title: {r['title']}")
            print(f"URL: {r['url']}")
            print(f"Snippet: {r['snippet']}")
            print(f"Content preview:\n{r['content'][:500]}...\n")


Keys in JSON response: dict_keys(['error'])
⚠️ No organic_results found in response
⚠️ No search results returned
⚠️ No pages retrieved


In [10]:
import requests
from bs4 import BeautifulSoup
import re
import urllib.parse

# -----------------------------
# 1. Get top Google search URLs
# -----------------------------
def google_search(query, num_results=5):
    """Scrape top Google search results (titles + links)."""
    query = urllib.parse.quote_plus(query)
    url = f"https://www.google.com/search?q={query}&num={num_results}"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                      "(KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
    }
    res = requests.get(url, headers=headers)
    soup = BeautifulSoup(res.text, "html.parser")

    results = []
    for g in soup.find_all('div', class_='tF2Cxc'):
        a_tag = g.find('a')
        title_tag = g.find('h3')
        if a_tag and title_tag:
            link = a_tag['href']
            title = title_tag.get_text()
            snippet_tag = g.find('span', class_='aCOpRe')
            snippet = snippet_tag.get_text() if snippet_tag else ""
            results.append({
                "title": title,
                "link": link,
                "snippet": snippet
            })
    return results[:num_results]

# -----------------------------
# 2. Extract text from web page
# -----------------------------
def extract_text_from_url(url, min_length=200):
    """Extract main content text from a web page."""
    try:
        html = requests.get(url, timeout=10).text
        soup = BeautifulSoup(html, "html.parser")
        for tag in soup(["script", "style", "noscript", "header", "footer", "form", "nav", "aside"]):
            tag.extract()
        text = soup.get_text(separator=" ")
        text = re.sub(r"\s+", " ", text).strip()
        if len(text) >= min_length:
            return text
        else:
            return ""
    except Exception as e:
        print(f"⚠️ Error parsing {url}: {e}")
        return ""

# -----------------------------
# 3. Full Web Retriever
# -----------------------------
def web_retriever(query, num_results=3):
    search_results = google_search(query, num_results)
    if not search_results:
        print("⚠️ No search results returned")
        return []

    pages = []
    for result in search_results:
        link = result["link"]
        text = extract_text_from_url(link)
        if text:
            pages.append({
                "title": result["title"],
                "url": link,
                "snippet": result.get("snippet", ""),
                "content": text[:5000]  # trim
            })
    return pages

# -----------------------------
# 4. Example Usage
# -----------------------------
if __name__ == "__main__":
    query = "Apple 2024 annual report summary"
    results = web_retriever(query, num_results=3)

    if not results:
        print("⚠️ No pages retrieved")
    else:
        for i, r in enumerate(results):
            print(f"--- Result {i+1} ---")
            print(f"Title: {r['title']}")
            print(f"URL: {r['url']}")
            print(f"Snippet: {r['snippet']}")
            print(f"Content preview:\n{r['content'][:500]}...\n")


⚠️ No search results returned
⚠️ No pages retrieved
