# Wipe notebook - just debugging stuff



In [9]:
# Delete existing web-search-sdk checkout in Colab
import subprocess, sys, pathlib, os, shutil, errno

REPO_DIR = pathlib.Path("/content/web-search-sdk")

# --- SAFETY GUARD: Multiple checks to ensure we're in Google Colab ---
def is_google_colab():
    """Check if we're running in Google Colab environment"""
    # Method 1: Check if google.colab module is available
    try:
        import google.colab
        return True
    except ImportError:
        pass
    
    # Method 2: Check if google.colab is in sys.modules
    if "google.colab" in sys.modules:
        return True
    
    # Method 3: Check environment variables
    import os
    if os.environ.get('COLAB_GPU') or os.environ.get('COLAB_TPU'):
        return True
    
    # Method 4: Check if we're in /content directory (Colab's working dir)
    if str(pathlib.Path.cwd()).startswith('/content'):
        return True
    
    return False

if not is_google_colab():
    raise RuntimeError(
        "❌ SAFETY CHECK FAILED: This notebook is NOT running in Google Colab!\n"
        "• Current working directory: " + str(pathlib.Path.cwd()) + "\n"
        "• sys.modules contains google.colab: " + str("google.colab" in sys.modules) + "\n"
        "• Environment variables: COLAB_GPU=" + str(os.environ.get('COLAB_GPU', 'None')) + 
        ", COLAB_TPU=" + str(os.environ.get('COLAB_TPU', 'None')) + "\n"
        "Refusing to delete or modify any local files. This cell is for Colab only."
    )

print("✅ SAFETY CHECK PASSED: Running in Google Colab - proceeding with cleanup")
---------------------------------------------------------------------------
if REPO_DIR.exists():
    print("Removing:", REPO_DIR)
    # force-remove, ignore errors
    subprocess.call(["rm", "-rf", str(REPO_DIR)])

# sanity-check
print("Exists after delete?", REPO_DIR.exists())
os.system("ls -la /content | head")  # show /content root

SyntaxError: invalid syntax (60320609.py, line 42)

#Web-Search SDK — End-to-End Demo

Welcome!  This notebook shows how to install, configure, and use the **web-search-sdk** to pull public web signals in just a few lines of code.



**What you can do here**
1. 🌐 Search via DuckDuckGo (links, keywords, structured results)  
2. 📰 Fetch fresh headlines with Google News RSS  
3. 📝 Extract clean text from any article or URL
4. 📚 Query knowledge bases (Wikipedia, RelatedWords)  
5. 🛠️ Leverage utilities: tokenisation, rate-limiting, parallel scraping  
6. 🔄 Combine everything into an easy and fast mini pipeline (search → extract → sentiment-ready text)

<br>

**How it works**\
• All helpers are **async** coroutines — use `await` or `asyncio.gather`.  
• Network behaviour is controlled via a lightweight `ScraperContext`.  
• For tricky sites the SDK auto-launches a stealth Playwright (and falls back to Selenium if needed).  
• Results are returned as plain Python dicts / lists for immediate analysis.

<br>

**Quickstart**
1. Run the next cell (“Environment Setup”) — it cleans the workspace, clones the latest repo version, installs the package and browser binaries.  
2. Execute each demo section in order: **Search → News → Article Extraction → Knowledge Banks → Utilities**.  
3. Use or adapt any cell for your own projects.

<br>


<small><i>Tip: set `OFFLINE_MODE=1` to run against fixture HTML only (CI / airplane mode).</i></small>

# 🔧 Environment Setup
This cell makes the notebook **stand-alone** when run on Colab or any fresh VM.
It clones the repo (if missing) and installs the minimal Python packages
required for the demo. Safe to re-run — it skips work if everything is ready.

 <https://github.com/Gregory-307/web-search-sdk>






In [None]:
import subprocess, sys, pathlib, os, shutil, errno

REPO_URL = "https://github.com/Gregory-307/web-search-sdk.git"
REPO_DIR = pathlib.Path("/content/web-search-sdk")   # explicit, avoids confusion

# --- SAFETY GUARD: Multiple checks to ensure we're in Google Colab ---
def is_google_colab():
    """Check if we're running in Google Colab environment"""
    # Method 1: Check if google.colab module is available
    try:
        import google.colab
        return True
    except ImportError:
        pass
    
    # Method 2: Check if google.colab is in sys.modules
    if "google.colab" in sys.modules:
        return True
    
    # Method 3: Check environment variables
    import os
    if os.environ.get('COLAB_GPU') or os.environ.get('COLAB_TPU'):
        return True
    
    # Method 4: Check if we're in /content directory (Colab's working dir)
    if str(pathlib.Path.cwd()).startswith('/content'):
        return True
    
    return False

if not is_google_colab():
    raise RuntimeError(
        "❌ SAFETY CHECK FAILED: This notebook is NOT running in Google Colab!\n"
        "• Current working directory: " + str(pathlib.Path.cwd()) + "\n"
        "• sys.modules contains google.colab: " + str("google.colab" in sys.modules) + "\n"
        "• Environment variables: COLAB_GPU=" + str(os.environ.get('COLAB_GPU', 'None')) + 
        ", COLAB_TPU=" + str(os.environ.get('COLAB_TPU', 'None')) + "\n"
        "Refusing to delete or modify any local files. This cell is for Colab only."
    )

print("✅ SAFETY CHECK PASSED: Running in Google Colab - proceeding with cleanup")
# ---------------------------------------------------------------------------

# 1) Hard-delete any previous checkout (shell rm -rf is the most reliable)
if REPO_DIR.exists():
    print(f"Removing existing folder: {REPO_DIR}")
    subprocess.call(["rm", "-rf", str(REPO_DIR)])   # ignore errors

# 2) Double-check and fall back to shutil (Windows attributes, etc.)
try:
    shutil.rmtree(REPO_DIR)
except FileNotFoundError:
    pass
except OSError as e:  # read-only files edge-case
    if e.errno != errno.ENOENT:
        raise

# 3) Fresh clone
print("Cloning latest repo …")
subprocess.check_call(["git", "clone", "--depth", "1", REPO_URL, str(REPO_DIR)])

ROOT = REPO_DIR.resolve()

# 4) Remove any installed egg / wheel + pip cache
subprocess.call([sys.executable, "-m", "pip", "uninstall", "-y", "web-search-sdk"],
                stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
subprocess.call([sys.executable, "-m", "pip", "cache", "purge"],
                stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

# 5) Clean install (editable) + Playwright + browser binaries
subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "--no-cache-dir",
                       "-e", f"{ROOT}[browser]"])
subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "playwright"])
subprocess.check_call([sys.executable, "-m", "playwright", "install", "--with-deps"],
                      stdout=subprocess.DEVNULL)

# 6) Ensure editable repo is first on sys.path
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))

print("✅ Colab environment wiped & SDK re-installed")

RuntimeError: ❌ SAFETY CHECK FAILED: This notebook is NOT running in Google Colab!
• Current working directory: c:\Users\gregb\OneDrive\Documents\GitHub\web-search-sdk
• sys.modules contains google.colab: False
• Environment variables: COLAB_GPU=None, COLAB_TPU=None
Refusing to delete or modify any local files. This cell is for Colab only.


2  Quick Smoke Test

Built-in `smoke_test.py` script runs without network access.
**Expected output**: version string + globals dictionary showing imported SDK functions + smoke test results with top-tokens for "openai" (default term). This takes <2 s.


In [None]:
import runpy, importlib.metadata as md, pathlib
print("web_search_sdk version:", md.version("web-search-sdk"))
runpy.run_path(str((pathlib.Path.cwd() / 'smoke_test.py')))

web_search_sdk version: 0.2.0


FileNotFoundError: [Errno 2] No such file or directory: 'c:\\Users\\gregb\\OneDrive\\Documents\\GitHub\\web-search-sdk\\web-search-sdk\\smoke_test.py'

# SDK Functionality Cheetsheet

`ScraperContext` is the _single_ configuration object shared by every helper
in the SDK. You pass it for every function.

It controls:
- Browser usage: When to use Selenium/Playwright vs plain HTTP requests (requests often fail still)
- Network settings like timeouts or retry attempts
- Identity management: Custom User-Agent rotation, proxy support
- Debugging: Verbose logging for troubleshooting

The cell below instantiates three ready-made contexts to reuse in later
examples. Th



In [None]:
from web_search_sdk.scrapers.base import ScraperContext

ctx = ScraperContext()

# When you call ScraperContext() with no arguments, you get these defaults:
# - timeout: 20.0
# - retries: 2
# - use_browser: False
# - browser_type: "selenium"
# - debug: False
# - headers: {}
# - user_agents: None
# - proxy: None

## Custom examples (skip this section if not important)

In [None]:
# 1. HTTP-only context (fastest, most basic)
ctx_http = ScraperContext(
    debug=True,                    # Show HTTP requests in console
    timeout=30.0,                  # 30 second timeout
    retries=3,                     # Retry failed requests 3 times
    user_agents=["Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"]  # Custom UA
)

# 2. Selenium context (reliable, slower, good for simple sites)
ctx_selen = ScraperContext(
    use_browser=True,              # Enable browser fallback
    browser_type="selenium",       # Use Firefox via Selenium
    debug=True,                    # Show browser actions
    timeout=45.0,                  # Longer timeout for browser
    retries=2,                     # Fewer retries (browser is expensive)
    proxy="http://proxy.example.com:8080"  # Optional proxy
)

# 3. Playwright stealth context (best for CAPTCHA-heavy sites)
ctx_play = ScraperContext(
    use_browser=True,              # Enable browser fallback
    browser_type="playwright_stealth",  # Chromium with anti-bot patches
    debug=True,                    # Show browser actions
    timeout=60.0,                  # Longest timeout (stealth takes time)
    retries=1,                     # Minimal retries (very expensive)
    headers={                      # Custom headers for stealth
        "Accept-Language": "en-US,en;q=0.9",
        "Accept-Encoding": "gzip, deflate, br",
        "DNT": "1"
    }
)

# Display the contexts to see their configurations
print("=== HTTP Context ===")
print(f"Browser: {ctx_http.use_browser}")
print(f"Timeout: {ctx_http.timeout}s")
print(f"Retries: {ctx_http.retries}")
print(f"Debug: {ctx_http.debug}")

print("\n=== Selenium Context ===")
print(f"Browser: {ctx_selen.use_browser} ({ctx_selen.browser_type})")
print(f"Timeout: {ctx_selen.timeout}s")
print(f"Retries: {ctx_selen.retries}")
print(f"Proxy: {ctx_selen.proxy}")

print("\n=== Playwright Stealth Context ===")
print(f"Browser: {ctx_play.use_browser} ({ctx_play.browser_type})")
print(f"Timeout: {ctx_play.timeout}s")
print(f"Retries: {ctx_play.retries}")
print(f"Custom headers: {len(ctx_play.headers)} headers")

=== HTTP Context ===
Browser: False
Timeout: 30.0s
Retries: 3
Debug: True

=== Selenium Context ===
Browser: True (selenium)
Timeout: 45.0s
Retries: 2
Proxy: http://proxy.example.com:8080

=== Playwright Stealth Context ===
Browser: True (playwright_stealth)
Timeout: 60.0s
Retries: 1
Custom headers: 3 headers


# Main Menu
---

### 🌐 [Search](#scrollTo=jjvjnyumbsZM)
- **DuckDuckGo Search** – primary search engine (reliable, no CAPTCHA)  
  ```python
  ddg_result = await ddg_search_and_parse("bitcoin rally")

  ddg_result["links"]       → list of links
  ddg_result["top_words"]   → most-frequent words
  ddg_result["results"]     → list of dicts (title, snippet, url, source)

  raw_html = await ddg_search_raw("bitcoin rally")   # BeautifulSoup object
  ```
- ~~**Google Search** – alternative engine (may hit CAPTCHA)~~ *(currently disabled)*  
  ~~`google_web_top_words("bitcoin")`~~

### 📰 [News](#scrollTo=VxoJ-ejpHnC8)
- **Google News**  
  ```python
  news_result = await google_news("bitcoin")

  news_result["headlines"]  → list of headlines
  news_result["summaries"]  → list of summaries
  news_result["sources"]    → list of sources
  news_result["top_words"]  → keyword frequency

  raw_rss = await google_news_raw("bitcoin")   # raw RSS string
  ```

### 📝 [Content Extraction](#scrollTo=lqWHXXfmJbLa)
- **Content Extraction (for articles with no paywall)**  
  ```python
  article = await extract_article_content("https://example.com/article")

  article["title"]         → article title
  article["content"]       → clean text
  article["summary"]       → short summary
  article["author"]        → author name
  article["publish_date"]  → YYYY-MM-DD
  article["source"]        → site name
  ```

### 📚 [Knowledge Banks](#scrollTo=IZjJWe84ITeo)
- **Wikipedia**  
  ```python
  wiki = await wikipedia("bitcoin")

  wiki["title"]       → page title
  wiki["content"]     → full article text
  wiki["links"]       → internal links
  wiki["top_words"]   → frequent words

  raw_html = await wikipedia_raw("bitcoin")   # raw HTML string
  ```
- **RelatedWords.org**  
  ```python
  words = await related_words("bitcoin")   # returns list[str]
  ```

<br>

### Additional Functionality

#### 🛠️ [Utilities & Pipeline](#scrollTo=sXKJQCV9IUia)
- Text helpers: `tokenise()`, `remove_stopwords()`, `most_common()`  
- Output helpers: `to_json()`, `to_csv()`  
- Rate limiting: `@rate_limited` decorator  
- Parallel scraping: `gather_scrapers()`

#### 💹 Financial Data
- **Stock Information**  
  ```python
  stock = await fetch_stock_data("AAPL")

  stock["ohlcv"]   → OHLCV via yfinance
  stock["info"]    → company info
  ```

#### 📊 Trends & Analytics
- **Google Trends** *(deprecated – migrate to trends-sdk)*  
  ```python
  trends = await interest_over_time("bitcoin")

  trends["values"]  → trend values
  trends["dates"]   → corresponding dates
  ```

  ```python
  batch = await batch_interest_over_time(["btc", "eth"])

  batch["results"]  → trends for multiple terms
  ```

---

Each function accepts an optional `ScraperContext` for custom headers, proxies, browser fallback, etc.  
The examples above rely on the built-in defaults, so you can run them as-is.

# Demo Examples



## 🌐 Search

#### DuckDuckGo Search Functions

1 Linear (single query)

In [None]:
from web_search_sdk.scrapers.duckduckgo_enhanced import ddg_search_and_parse

ddg_result = await ddg_search_and_parse("bitcoin rally", top_n=5)

print("Links:\n" + "\n".join(ddg_result["links"]))
print("Top words:", ddg_result["top_words"])
print("First title:", ddg_result["results"][0]["title"] if ddg_result["results"] else "none")


# Result objects look like:
# {
#     'title'  : 'Bitcoin hits fresh high as ...',
#     'snippet': 'The cryptocurrency surged past ...',
#     'url'    : 'https://www.cnbc.com/...',
#     'source' : 'CNBC'
# }


# top_n defaults to 10; it the hard max on *all* three lists
#   • links
#   • top_words
#   • results
# (functionality will change to be independant)

Links:
https://www.thestreet.com/crypto/markets/bitcoin-experts-explain-why-its-set-to-rally-to-200000-this-year
https://www.benzinga.com/crypto/cryptocurrency/25/07/46345201/bitcoins-rally-to-112000-yes-its-different-this-time
https://www.businessinsider.com/bitcoin-price-today-why-is-btc-up-crypto-week-etfs-2025-7?op=1
https://cointelegraph.com/news/bitcoin-price-likely-to-hit-dollar130k-before-serious-profit-taking-kicks-in
https://www.marketwatch.com/story/bitcoin-hits-record-high-why-the-cryptos-rally-may-lose-steam-after-briefly-topping-112-000-00941186
Top words: ['bitcoin', 'rally', 'high', 'crypto', 'time']
First title: Bitcoin experts explain why it's set to rally to $200,000 this year


2 Async (two queries in parallel)

In [None]:
# %% DuckDuckGo – async / parallel example
import asyncio
from web_search_sdk.scrapers.duckduckgo_enhanced import ddg_search_and_parse

terms = ["bitcoin rally", "ethereum futures"]

results = await asyncio.gather(*(ddg_search_and_parse(t, top_n=5) for t in terms))

for term, res in zip(terms, results):
    print(f"\n{term.upper()}")
    print("Links:", res["links"])
    print("Top words:", res["top_words"])


BITCOIN RALLY
Links: ['https://www.benzinga.com/crypto/cryptocurrency/25/07/46345201/bitcoins-rally-to-112000-yes-its-different-this-time', 'https://www.usatoday.com/story/money/investing/2025/07/15/bitcoin-2025-surge-will-keep-going/85187579007/', 'https://www.businessinsider.com/bitcoin-price-today-why-is-btc-up-crypto-week-etfs-2025-7?op=1', 'https://www.thestreet.com/crypto/markets/bitcoin-experts-explain-why-its-set-to-rally-to-200000-this-year', 'https://cointelegraph.com/news/bitcoin-price-likely-to-hit-dollar130k-before-serious-profit-taking-kicks-in']
Top words: ['bitcoin', 'rally', 'time', 'crypto', 'year']

ETHEREUM FUTURES
Links: ['https://www.cmegroup.com/markets/cryptocurrencies/ether/ether.html', 'https://www.investing.com/crypto/ethereum/ether-futures', 'https://www.tradingview.com/symbols/CME-ETH1!/', 'https://www.binance.com/en/futures/etH_USDT', 'https://www.coinbase.com/advanced-trade/crypto-futures']
Top words: ['futures', 'ether', 'trade', 'crypto', 'overview']


3 Custom context and raw html

Configure contexts if you need (not necessary but allows for customisation).\
Each function has it's own default ctx (that I have tuned to work but that may change over time).
```
ddg_search_and_parse()'s default is this:

   ctx = ScraperContext(
      use_browser=False,   # plain HTTP, no browser fallback
      headers={},          # no custom headers (UA added internally)
      timeout=20.0,        # seconds
      retries=2,           # retry count on failures
      proxy=None,          # no proxy
      debug=False,         # quiet mode
      browser_type="selenium"  # ignored unless use_browser=True
  )
```



In [None]:
# %% DuckDuckGo – custom context + raw HTML
from web_search_sdk.scrapers.base import ScraperContext
from web_search_sdk.scrapers.duckduckgo_enhanced import (
    ddg_search_and_parse,
    ddg_search_raw,
)


# Custom tweeks to config:
ctx = ScraperContext(
    use_browser=True,
    browser_type="playwright_stealth",
    timeout=40,
    debug=True,
)

structured = await ddg_search_and_parse("openai gpt-4", ctx=ctx, top_n=3)
raw_soup   = await ddg_search_raw("openai gpt-4", ctx=ctx)

print("Structured links:", structured["links"])
print("Raw HTML length:", len(raw_soup.prettify()))

💡 Tip: ddg_search_and_parse works fine with HTTP context (faster). Browser context is optional.


{"url": "https://html.duckduckgo.com/html/?q=openai%20gpt-4&kl=us-en", "event": "http_get", "timestamp": "2025-07-17T15:26:08.233009Z"}
HTTP Request: GET https://html.duckduckgo.com/html/?q=openai%20gpt-4&kl=us-en "HTTP/1.1 200 OK"


💡 Tip: ddg_search_raw works fine with HTTP context (faster). Browser context is optional.


{"url": "https://html.duckduckgo.com/html/?q=openai%20gpt-4&kl=us-en", "event": "http_get", "timestamp": "2025-07-17T15:26:10.658014Z"}


<br>

## 📰 News


1 Google News – linear (single query)


In [None]:
from web_search_sdk.scrapers.news import google_news

news = await google_news("bitcoin", top_n=5)   # top_n=20 by default

print("Headlines:")
for h in news["headlines"]:
    print("  -", h)

print("\nTop words:", news["top_words"])

# Show the first structured item so users understand the shape
if news["headlines"]:
    print("\nExample record:")
    print({
        "headline" : news["headlines"][0],
        "summary"  : news["summaries"][0],
        "source"   : news["sources"][0]
    })

Headlines:
  - Why is the price of bitcoin hitting a record high? Experts explain. - ABC News - Breaking News, Latest News and Videos
  - Ether is starting to outperform bitcoin – and further gains could be ahead for the crypto, charts show - CNBC
  - Asia Morning Briefing: Bitcoin Eyes $130K as Euphoria Builds, But ETH and SOL Steal the Show - CoinDesk
  - Bitcoin, ETH, XRP, Solana Are In For A 'Long And Exhausting Bull Market,' Bernstein Predicts - Yahoo Finance
  - Cantor Fitzgerald close to $4bn Spac deal with bitcoin pioneer - Financial Times

Top words: ['nbsp', 'font', 'news', 'bitcoin', 'href']

Example record:
{'headline': 'Why is the price of bitcoin hitting a record high? Experts explain. - ABC News - Breaking News, Latest News and Videos', 'summary': '<ol><li><a href="https://news.google.com/rss/articles/CBMiogFBVV95cUxPTjdZN0JTR1JudW5iUFBqTklxclJsQ1cwY0xvWnMxUmdYSFkwRWlRMjEwUTdSUnFFNDd3OWV0MlQ5TTFFZDRpWkVtNXZzOG8tb3V1eGl4VXhyT0FxeTZIck5iNkw4RFpuRzZxMzQ5MlRQaXcwUkxiTjQyaDdQ

2 Google News – async (two queries in parallel)


In [None]:
import asyncio
from web_search_sdk.scrapers.news import google_news

terms = ["bitcoin", "ethereum"]

results = await asyncio.gather(*(google_news(t, top_n=5) for t in terms))

for term, res in zip(terms, results):
    print(f"\n{term.upper()}")
    print("Headlines:")
    for h in res["headlines"]:
        print("  -", h)
    print("Top words:", res["top_words"])


BITCOIN
Headlines:
  - Why is the price of bitcoin hitting a record high? Experts explain. - ABC News - Breaking News, Latest News and Videos
  - Ether is starting to outperform bitcoin – and further gains could be ahead for the crypto, charts show - CNBC
  - Asia Morning Briefing: Bitcoin Eyes $130K as Euphoria Builds, But ETH and SOL Steal the Show - CoinDesk
  - Bitcoin, ETH, XRP, Solana Are In For A 'Long And Exhausting Bull Market,' Bernstein Predicts - Yahoo Finance
  - Cantor Fitzgerald close to $4bn Spac deal with bitcoin pioneer - Financial Times
Top words: ['nbsp', 'font', 'news', 'bitcoin', 'href']

ETHEREUM
Headlines:
  - ETH rises above $3,300 as spot Ethereum ETFs post record $726 million in daily net inflows - The Block
  - SEC Delays Decision on Bitwise Bitcoin, Ethereum ETFs' In-kind Redemptions - Decrypt
  - Ethereum ETFs attract record $726 million, pushing ETH price above $3,400 - CryptoSlate
  - Beyond Bitcoin: Ethereum as a Corporate Treasury Asset - galaxy.com
 

<br>

## 📝 Content Extraction


1 Article Extraction – linear (single URL)


In [None]:
from web_search_sdk.scrapers.base import ScraperContext
from web_search_sdk.scrapers import extract_article_content

url = "https://www.cnbc.com/2025/07/14/bitcoin-hits-new-all-time-high-above-120000-fueled-by-etf-inflows-crypto.html"

ctx = ScraperContext(
    use_browser=True,
    browser_type="playwright_stealth",
    timeout=40,
    debug=True,
)

article = await extract_article_content(url, ctx=ctx)

if article.get("error"):
    print("❌ Extraction failed:", article["error"])
else:
    print("Title  :", article["title"])
    print("Source :", article["source"])
    print("Summary:", article["summary"][:150] + "…")
    print("\nFirst 300 chars of content:\n", article["content"][:300], "…")

[browser:PW] GET https://www.cnbc.com/2025/07/14/bitcoin-hits-new-all-time-high-above-120000-fueled-by-etf-inflows-crypto.html
Title  : Bitcoin hits new high above $120,000 as U.S. lawmakers begin ‘Crypto Week’
Source : CNBC
Summary: Representations of Bitcoins resting on U.S…

First 300 chars of content:
 Representations of Bitcoins resting on U.S. dollar banknotes Maeva Destombes | Afp | Getty Images Bitcoin traded above $120,000 to set a new record high on Monday as U.S. lawmakers gear up to potentially pass regulatory changes that could bolster institutional demand. The largest cryptocurrency by m …


2 Article Extraction – async (two URLs in parallel)

**FYI: Bloomberg is paywalled and will detect this traffic, it will always fail.
Please see here as an example.**

In [None]:
import asyncio
from web_search_sdk.scrapers.base import ScraperContext
from web_search_sdk.scrapers import extract_article_content

urls = [
    "https://www.cnbc.com/2025/07/14/bitcoin-hits-new-all-time-high-above-120000-fueled-by-etf-inflows-crypto.html",
    "https://www.bloomberg.com/news/articles/2025-07-14/ethereum-jumps-as-upgrade-optimism-builds"
]

# One shared context with Playwright enabled
ctx = ScraperContext(
    use_browser=True,
    browser_type="playwright_stealth",
    timeout=40,
    debug=True,
)

articles = await asyncio.gather(*(extract_article_content(u, ctx=ctx) for u in urls))

for art in articles:
    if art.get("error"):
        print(f"\n❌ FAILED for {art['url']}\n   → {art['error']}")
    else:
        print(f"\n--- {art['source']} ---")
        print("Title :", art["title"])
        print("Chars :", len(art["content"]))



[browser:PW] GET https://www.cnbc.com/2025/07/14/bitcoin-hits-new-all-time-high-above-120000-fueled-by-etf-inflows-crypto.html
[browser:PW] GET https://www.bloomberg.com/news/articles/2025-07-14/ethereum-jumps-as-upgrade-optimism-builds

--- CNBC ---
Title : Bitcoin hits new high above $120,000 as U.S. lawmakers begin ‘Crypto Week’
Chars : 3430

--- BLOOMBERG ---
Title : Bloomberg - Are you a robot?
Chars : 597


<br>

## 📚 Knowledge Banks


1 Wikipedia - Linear (single topic)

In [None]:
from web_search_sdk.scrapers.wikipedia import wikipedia

wiki = await wikipedia("bitcoin", top_n=10)

print("Title :", wiki["title"])
print("Top words:", ", ".join(wiki["top_words"]))
print("Links sample:", wiki["links"][:10])
print("Content preview:\n", wiki["content"][:400], "…")

Title : Bitcoin
Top words: bitcoin, retrieved, original, archived, november, december, january, june, october, july
Links sample: ['Bit (money)', 'Thai baht', 'Satoshi Nakamoto', 'White paper', 'Bitcoin Core', 'Latest release', 'Code repository', 'Written in', 'Free and open-source software', 'License']
Content preview:
 Decentralized digital currency 
 For the colloquial expression for coinage, see  Bit (money) . 
 "₿" redirects here and is not to be confused with "฿" for  Thai baht . 
 
 
 Bitcoin Commonly used logo of bitcoin Denominations Plural bitcoins Symbol ₿ (Unicode:  U+20BF   ₿   BITCOIN SIGN ) [ 1 ] Code BTC Precision 10 −8 Subunits   1 ⁄ 1000 millibitcoin   1 ⁄ 1 000 000 microbitcoin   1 ⁄ 100 000 000 …


2 Wikipedia - async (two topic in parallel)

In [None]:
import asyncio
from web_search_sdk.scrapers.wikipedia import wikipedia

topics = ["bitcoin", "ethereum"]

pages = await asyncio.gather(*(wikipedia(t, top_n=5) for t in topics))

for t, p in zip(topics, pages):
    print(f"\n{t.upper()} – top words: {p['top_words']}")


BITCOIN – top words: ['bitcoin', 'retrieved', 'original', 'archived', 'november']

ETHEREUM – top words: ['ethereum', 'retrieved', 'original', 'archived', 'blockchain']


3 Wikipedia – custom ScraperContext + raw HTML


In [None]:
from web_search_sdk.scrapers.base import ScraperContext
from web_search_sdk.scrapers.wikipedia import wikipedia, wikipedia_raw

ctx = ScraperContext(debug=True, timeout=30)

page      = await wikipedia("openai", ctx=ctx, top_n=8)
raw_html  = await wikipedia_raw("openai", ctx=ctx)

print("Links (capped to 8):", page["links"])
print("Raw HTML length:", len(raw_html))

Links (capped to 8): ['OpenAL', 'OpenAPI', 'Open-source artificial intelligence', 'Private', 'Artificial intelligence', 'Sam Altman', 'Elon Musk', 'Ilya Sutskever']
Raw HTML length: 590508


4 RelatedWords.org – linear (single seed)


In [None]:
from web_search_sdk.scrapers.related import related_words

words = await related_words("bitcoin")
print("Related words (first 20):\n", words[:20])

Related words (first 20):
 ['blockchain', 'currency', 'bitcoin network', 'cryptography', 'node', 'satoshi nakamoto', 'open-source software', 'cryptocurrency wallet', 'cryptocurrency exchange', 'cryptocurrency', 'public-key cryptography', 'ethereum', 'qt', 'central bank', 'distributed ledger', 'university of cambridge', 'nobel memorial prize in economic sciences', 'gavin andresen', 'silk road', 'youtube']


5 RelatedWords.org – async (two seeds in parallel)


In [None]:
import asyncio
from web_search_sdk.scrapers.related import related_words

seeds = ["bitcoin", "blockchain"]

word_lists = await asyncio.gather(*(related_words(s) for s in seeds))

for s, wl in zip(seeds, word_lists):
    print(f"\n{s.upper()} – {len(wl)} words")
    print(", ".join(wl[:15]), "…")


BITCOIN – 288 words
blockchain, currency, bitcoin network, cryptography, node, satoshi nakamoto, open-source software, cryptocurrency wallet, cryptocurrency exchange, cryptocurrency, public-key cryptography, ethereum, qt, central bank, distributed ledger …

BLOCKCHAIN – 156 words
bitcoin, ledger, peer-to-peer, cryptocurrency, ethereum, cryptocurrencies, cryptography, merkle tree, distributed ledger, proof-of-work system, server, trusted timestamping, hashcash, satoshi nakamoto, digital signature …


<br>

## 🛠️ Utilities & Pipeline


1 Text helpers – tokenise / stop-word removal / frequency

In [None]:
from web_search_sdk.utils.text import tokenise, remove_stopwords, most_common

sentence = "Bitcoin hits a new all-time high as investors brace for another rally."

tokens        = tokenise(sentence)
clean_tokens  = remove_stopwords(tokens)
top_freq      = most_common(tokens, 5)

print("Tokens         :", tokens)
print("No stop-words  :", clean_tokens)
print("Top frequency  :", top_freq)

Tokens         : ['bitcoin', 'hits', 'new', 'all', 'time', 'high', 'as', 'investors', 'brace', 'for', 'another', 'rally']
No stop-words  : ['bitcoin', 'hits', 'new', 'time', 'high', 'investors', 'brace', 'another', 'rally']
Top frequency  : ['bitcoin', 'hits', 'new', 'time', 'high']


2 Output helpers – write JSON & CSV

In [None]:
from web_search_sdk.utils.output import to_json, to_csv
import pathlib, json, csv

pathlib.Path("out").mkdir(exist_ok=True)

json_path = "out/demo_tokens.json"
csv_path  = "out/demo_tokens.csv"

data = ["bitcoin", "ethereum", "solana"]

to_json(data, json_path)                    # overwrite by default
to_csv([{"token": t} for t in data], csv_path)

print("Wrote JSON →", json_path, "| bytes:", pathlib.Path(json_path).stat().st_size)
print("Wrote CSV  →", csv_path,  "| rows :", sum(1 for _ in open(csv_path)))

Wrote JSON → out/demo_tokens.json | bytes: 41
Wrote CSV  → out/demo_tokens.csv | rows : 4


3 Rate-limit decorator – two calls per second


In [None]:
import asyncio
from web_search_sdk.utils.rate_limit import rate_limited

@rate_limited(calls=2, period=1.0)      # max 2 calls / second
async def ping(i: int):
    print("tick", i)

await asyncio.gather(*[ping(i) for i in range(6)])

tick 0
tick 1
tick 2
tick 3
tick 4
tick 5


[None, None, None, None, None, None]

4 Parallel scraping helper – gather_scrapers

In [None]:
# %% Parallel scraping – first link per term (fixed)
import asyncio
from web_search_sdk.scrapers.base import gather_scrapers
from web_search_sdk.scrapers.duckduckgo_enhanced import _fetch_html as ddg_fetch
from web_search_sdk.scrapers.duckduckgo_enhanced import _parse_html as ddg_parse

terms = ["bitcoin", "ethereum", "solana"]

def _parse_wrapper(html: str, term: str, ctx):
    parsed = ddg_parse(html, top_n=1)        # returns dict with "links"
    return parsed["links"][0] if parsed["links"] else None

links = await gather_scrapers(
    terms,
    fetch=ddg_fetch,        # enhanced fetch (same HTML endpoint)
    parse=_parse_wrapper,
    parallelism=3,
)

print("Fetched links:", links)

Fetched links: ['https://bitcoin.org/', 'https://ethereum.org/en/', 'https://solana.com/']


# Final Demo

5 Mini pipeline – search → article extraction (sentiment-ready)


In [None]:
# %% Mini pipeline – search → article extraction (robust)
import asyncio, textwrap
from web_search_sdk.scrapers.search import search_and_parse
from web_search_sdk.scrapers import extract_article_content
from web_search_sdk.scrapers.base import ScraperContext

ctx_play = ScraperContext(use_browser=True,
                          browser_type="playwright_stealth",
                          timeout=40)

def wrap(text, width=90):
    return "\n".join(textwrap.wrap(text, width))

async def pipeline(topic: str, top_n:int=5):
    print(f"Searching news for: {topic!r}")
    res = await search_and_parse(topic, top_n=top_n)

    successes, failures = [], []
    for r in res["results"]:
        url = r["url"]
        if not url:
            continue
        art = await extract_article_content(url, ctx=ctx_play)
        if art.get("content"):
            successes.append({
                "title": art["title"],
                "source": art["source"],
                "chars": len(art["content"]),
                "url": url
            })
        else:
            failures.append({"url": url, "reason": art.get("error","empty/blocked")})

    return successes, failures

succ, fail = await pipeline("bitcoin rally")

print("\n✅ Extracted articles:")
for a in succ:
    print(f"- {a['title']}  ({a['source']}, {a['chars']} chars)")

print("\n❌ Failures:")
for f in fail:
    print(f"- {f['url']}  → {f['reason']}")

Searching news for: 'bitcoin rally'

✅ Extracted articles:
- Bitcoin's Rally To $112,000: Yes, It's Different This Time  (BENZINGA, 9052 chars)
- Bitcoin is on a record-setting hot streak. 3 things are driving the latest rally in the top crypto.  (BUSINESSINSIDER, 5122 chars)
- Bitcoin price likely to hit $130K before serious profit taking kicks in  (COINTELEGRAPH, 3191 chars)

❌ Failures:
- https://www.thestreet.com/crypto/markets/bitcoin-experts-explain-why-its-set-to-rally-to-200000-this-year  → empty/blocked
- https://www.marketwatch.com/story/bitcoin-hits-record-high-why-the-cryptos-rally-may-lose-steam-after-briefly-topping-112-000-00941186  → empty/blocked
