In [1]:
!pip install langchain langchain-community langchain-core langchain-text-splitters beautifulsoup4 requests python-dotenv dashscope lxml html2text

Collecting dashscope
  Downloading dashscope-1.24.6-py3-none-any.whl.metadata (7.1 kB)
Collecting html2text
  Downloading html2text-2025.4.15-py3-none-any.whl.metadata (4.1 kB)
Downloading dashscope-1.24.6-py3-none-any.whl (1.3 MB)
   ---------------------------------------- 0.0/1.3 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.3 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.3 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.3 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.3 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.3 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.3 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.3 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.3 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.3 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.3 MB ? eta -:--:--
   ------

In [1]:
import os
import re
import time
import json
import asyncio
import aiohttp
import tldextract
from typing import List, Dict, Optional, Any, Set
from bs4 import BeautifulSoup
from dataclasses import dataclass, field
from urllib.parse import urljoin, urlparse
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

OPENROUTER_API_KEY = "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"
OPENROUTER_BASE_URL = "https://openrouter.ai/api/v1"
QWEN_MODEL = "qwen/qwen-3-4b"


In [3]:
from langchain.llms.base import LLM
from pydantic import BaseModel
import requests

class QwenLLM(LLM, BaseModel):
    api_key: str = OPENROUTER_API_KEY
    api_url: str = f"{OPENROUTER_BASE_URL}/chat/completions"
    model: str = QWEN_MODEL
    temperature: float = 0.0

    class Config:
        arbitrary_types_allowed = True

    def _call(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        headers = {
            "Authorization": f"Bearer {self.api_key}",
            "HTTP-Referer": "http://localhost",
            "X-Title": "IntelligentCrawler",
            "Content-Type": "application/json",
        }
        payload = {
            "model": self.model,
            "messages": [{"role": "user", "content": prompt}],
            "temperature": self.temperature,
        }
        resp = requests.post(self.api_url, headers=headers, json=payload, timeout=60)
        resp.raise_for_status()
        j = resp.json()
        return j["choices"][0]["message"]["content"]

    async def _acall(self, prompt: str, stop: Optional[List[str]] = None) -> str:
        return self._call(prompt, stop)

    @property
    def _identifying_params(self):
        return {"model": self.model}

    @property
    def _llm_type(self):
        return "qwen"

llm = QwenLLM()


In [4]:
@dataclass
class CrawlerAgent:
    session: aiohttp.ClientSession
    max_pages: int = 100
    max_depth: int = 2
    delay: float = 0.5
    visited: Set[str] = field(default_factory=set)
    domain_limit: Optional[str] = None

    async def fetch(self, url: str) -> Optional[str]:
        try:
            async with self.session.get(url, timeout=20) as resp:
                if resp.status == 200 and 'text' in resp.headers.get('content-type',''):
                    text = await resp.text(errors='ignore')
                    await asyncio.sleep(self.delay)
                    return text
        except Exception as e:
            print(f"[fetch error] {url} -> {e}")
        return None

    def extract_links_and_text(self, base_url: str, html: str) -> Dict[str, Any]:
        soup = BeautifulSoup(html, "html.parser")
        texts = [t.get_text(" ", strip=True) for t in soup.find_all(['p','h1','h2','h3','li'])]
        links = set()
        for a in soup.find_all("a", href=True):
            href = a['href'].strip()
            if href.startswith(("mailto:", "javascript:")):
                continue
            full = urljoin(base_url, href)
            parsed = urlparse(full)
            if parsed.scheme in ("http","https"):
                links.add(full.split('#')[0])
        return {"texts": texts, "links": list(links)}

    async def crawl(self, seed_url: str):
        to_visit = [(seed_url, 0)]
        self.domain_limit = tldextract.extract(seed_url).registered_domain
        results = []
        while to_visit and len(self.visited) < self.max_pages:
            url, depth = to_visit.pop(0)
            if url in self.visited or depth > self.max_depth:
                continue
            if self.domain_limit and tldextract.extract(url).registered_domain != self.domain_limit:
                continue
            html = await self.fetch(url)
            self.visited.add(url)
            if not html:
                continue
            parsed = self.extract_links_and_text(url, html)
            results.append({"url": url, "text_blocks": parsed["texts"], "links": parsed["links"]})
            for link in parsed["links"]:
                if link not in self.visited and len(self.visited)+len(to_visit) < self.max_pages:
                    to_visit.append((link, depth+1))
        return results


In [7]:
@dataclass
class CleaningAgent:
    llm: QwenLLM
    stop_words: set = field(default_factory=lambda: set(stopwords.words('english')))

    def basic_clean_text(self, text: str) -> str:
        return re.sub(r'\s+', ' ', text).strip()

    def merge_blocks(self, blocks: List[str]) -> str:
        seen, out = set(), []
        for b in blocks:
            b_clean = b.strip()
            if b_clean and b_clean.lower() not in seen:
                seen.add(b_clean.lower())
                out.append(b_clean)
        return "\n\n".join(out)

    def evaluate_quality(self, text: str) -> Dict[str, Any]:
        words = word_tokenize(text)
        return {
            "num_words": len(words),
            "avg_word_len": sum(len(w) for w in words)/max(1,len(words)),
            "stop_words_pct": sum(1 for w in words if w.lower() in self.stop_words)/max(1,len(words))
        }

    def clean_and_summarize(self, blocks: List[str]) -> Dict[str, Any]:
        merged = self.merge_blocks([self.basic_clean_text(b) for b in blocks])
        quality = self.evaluate_quality(merged)
        try:
            cleaned = self.llm._call(
                f"Clean this web text. Remove navigation/boilerplate, keep only meaningful content:\n{merged}"
            )
        except:
            cleaned = merged
        try:
            summary = self.llm._call(
                f"Summarize this text in 3-6 sentences and list 5 key points:\n{cleaned}"
            )
        except:
            summary = cleaned[:1000]
        return {"cleaned_text": cleaned, "summary": summary, "quality": quality}


In [9]:
async def run_pipeline(user_input: str,
                       max_pages: int = 30,
                       max_depth: int = 2,
                       delay: float = 0.5):
    is_url = user_input.startswith(("http://", "https://"))
    seed_urls = [user_input] if is_url else []

    if not is_url:
        try:
            resp = llm._call(
                f"Suggest up to 5 reliable URLs to learn about: {user_input}. Return only JSON array."
            )
            seed_urls = re.findall(r"https?://[^\s,\"\]]+", resp)[:5]
        except:
            return {"error":"no_seed_urls"}

    async with aiohttp.ClientSession(headers={"User-Agent":"IntelligentCrawler/1.0"}) as session:
        crawler = CrawlerAgent(session, max_pages, max_depth, delay)
        all_pages, visited = [], set()
        for seed in seed_urls:
            pages = await crawler.crawl(seed)
            all_pages.extend(pages)
            if len(crawler.visited) >= max_pages: break

    cleaner = CleaningAgent(llm=llm)
    cleaned_results = [
        {"url": p["url"], **cleaner.clean_and_summarize(p.get("text_blocks", []))}
        for p in all_pages
    ]
    return {"seed_urls": seed_urls, "raw_pages": all_pages, "cleaned": cleaned_results}


In [11]:
def run_sync(user_input: str, **kwargs):
    return asyncio.run(run_pipeline(user_input, **kwargs))

In [13]:
!pip install nest_asyncio
import nest_asyncio
nest_asyncio.apply()




In [15]:
seed = "https://en.wikipedia.org/wiki/Heliocentrism"  
out = run_sync(seed, max_pages=5, max_depth=1, delay=0.3)

print("Scraped pages:", len(out.get("raw_pages", [])))
pd.DataFrame([
    {"url":c["url"], "summary": c["summary"], "cleaned": c["cleaned_text"]}
    for c in out["cleaned"]
]).to_csv("crawler_cleaned_results.csv", index=False)
print("Saved crawler_cleaned_results.csv")


  self.domain_limit = tldextract.extract(seed_url).registered_domain
  if self.domain_limit and tldextract.extract(url).registered_domain != self.domain_limit:


Scraped pages: 4
Saved crawler_cleaned_results.csv


In [29]:
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\arsen\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\arsen\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\arsen\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True