In [1]:
!pip install beautifulsoup4 requests transformers langchain_community




In [5]:
!pip install html2text

Collecting html2text
  Downloading html2text-2025.4.15-py3-none-any.whl.metadata (4.1 kB)
Downloading html2text-2025.4.15-py3-none-any.whl (34 kB)
Installing collected packages: html2text
Successfully installed html2text-2025.4.15


## Content Extraction
We'll use BeautifulSoup to parse the HTML and extract only the main content, excluding things like headers, footers, and ads.\

In [2]:
from bs4 import BeautifulSoup
import requests
from langchain_community.tools import TavilySearchResults
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.document_transformers import Html2TextTransformer

def extract_main_content(url: str) -> str:
    """Extract the main content from the web page using BeautifulSoup."""
    try:
        response = requests.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')

        # Remove unwanted sections (e.g., headers, footers, sidebars, etc.)
        for script in soup(["script", "style", "header", "footer", "nav", "aside"]):
            script.decompose()

        # Extract main body text
        body = soup.find('main') or soup.find('article') or soup.find('div', {'class': 'content'})
        if body:
            return ' '.join([p.get_text() for p in body.find_all('p')])
        else:
            return soup.get_text()  # fallback in case body tag is not found
    except Exception as e:
        print(f"⚠️ Failed to extract content from {url}: {e}")
        return ""


USER_AGENT environment variable not set, consider setting it to identify your requests.


## Summarization
For more intelligent summarization, we can use a pre-trained model from HuggingFace Transformers, which is designed for tasks like summarization.

In [3]:
from transformers import pipeline

# Load a pre-trained summarization model
summarizer = pipeline("summarization", model="facebook/bart-large-cnn")

def advanced_summarize(text: str, max_length: int = 150) -> str:
    """Summarize the text using a pre-trained model from HuggingFace."""
    summarized = summarizer(text, max_length=max_length, min_length=50, do_sample=False)
    return summarized[0]['summary_text']


  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


## Research Agent:
We will now integrate the content extraction and improved summarization into the research agent.

In [6]:
from typing import List
from pydantic import BaseModel
from langgraph.graph import StateGraph
from langchain_community.tools import TavilySearchResults
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.document_transformers import Html2TextTransformer

# -------------------- State Schema --------------------
class ResearchState(BaseModel):
    topic: str
    docs: List[str] = []
    final_answer: str = ""

# -------------------- Simple Local Summarizer --------------------
def simple_summarize(text: str, max_sentences: int = 5) -> str:
    """Naive summarization: return the first N sentences."""
    import re
    sentences = re.split(r'(?<=[.!?]) +', text)
    return " ".join(sentences[:max_sentences])

# -------------------- Agent 1: Research Agent --------------------
def research_agent(state: ResearchState) -> dict:
    print("🔍 Researching:", state.topic)
    tavily_tool = TavilySearchResults(k=5, tavily_api_key="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")
    search_results = tavily_tool.run(state.topic)

    # Add debugging to inspect search results
    print(f"Search results: {search_results}")
    if not search_results:
        return {"docs": ["No valid content found."]}

    urls = [result['url'] for result in search_results[:3]]
    all_docs = []

    for url in urls:
        try:
            loader = WebBaseLoader(url)
            docs = loader.load()
            text_transformer = Html2TextTransformer()
            plain_docs = text_transformer.transform_documents(docs)
            all_docs.extend([doc.page_content for doc in plain_docs])
        except Exception as e:
            print(f"⚠️ Failed to load {url}: {e}")

    if not all_docs:
        return {"docs": ["No valid content found."]}

    summaries = [simple_summarize(doc) for doc in all_docs]
    return {"docs": summaries}

# -------------------- Agent 2: Drafting Agent --------------------
def drafting_agent(state: ResearchState) -> dict:
    print("📝 Drafting Answer...")
    merged = "\n\n".join(state.docs)
    answer = f"Here's a synthesized summary on the topic '{state.topic}':\n\n{simple_summarize(merged)}"
    return {"final_answer": answer}

# -------------------- LangGraph Workflow --------------------
graph = StateGraph(state_schema=ResearchState)
graph.add_node("research", research_agent)
graph.add_node("draft", drafting_agent)
graph.set_entry_point("research")
graph.add_edge("research", "draft")  # ← this is the fix
graph.set_finish_point("draft")

app = graph.compile()

# -------------------- Run the System --------------------
while True:
    query = input("🔍 Enter a topic to research: ")
    if query.lower() == 'exit':
        print("👋 Exiting. See you next time!")
        break

    # Limit query length if needed
    MAX_QUERY_LENGTH = 512
    if len(query) > MAX_QUERY_LENGTH:
        query = query[:MAX_QUERY_LENGTH]

    try:
        result = app.invoke({"topic": query})
        print("\n✅ Final Answer:\n")
        print(result["final_answer"])
    except Exception as e:
        print(f"⚠️ Error: {e}")


🔍 Researching: What is Machine Learning?
Search results: [{'title': 'Machine learning, explained | MIT Sloan', 'url': 'https://mitsloan.mit.edu/ideas-made-to-matter/machine-learning-explained', 'content': 'What is machine learning?\nMachine learning is a subfield of artificial intelligence, which is broadly defined as the capability of a machine to imitate intelligent human behavior. Artificial intelligence systems are used to perform complex tasks in a way that is similar to how humans solve problems. [...] When companies today deploy artificial intelligence programs, they are most likely using machine learning —\xa0so much so that the terms are often used interchangeably, and sometimes ambiguously. Machine learning is a subfield of artificial intelligence that gives computers the ability to learn without explicitly being programmed. [...] Machine learning is one way to use AI. It was defined in the 1950s by AI pioneer Arthur Samuel as “the field of study that gives computers the abil