In [21]:
import os
import requests
import json
from typing import List
from bs4 import BeautifulSoup
from openai import OpenAI

In [22]:
from IPython.display import Markdown, display, update_display

In [23]:
from openai import OpenAI
MODEL = "llama3.2"
openai = OpenAI(base_url="http://localhost:11434/v1", api_key="ollama")

In [24]:
headers = {
    "User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}

class website:
    url:str
    title:str
    body:str
    links:List[str]
    text:str

    def __init__(self, url):
        self.url = url
        response = requests.get(url)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')
        self.title = soup.title.string if soup.title else 'No Title Found'
        if soup.body:
            for irrelevant in soup.body(['script', 'style', 'img', 'input']):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator = "\n", strip = True)
        else:
            self.text = ""
        links = [link.get('href') for link in soup.find_all('a')]
        self.links = [link for link in links if link]

    def get_contents(self):
        return f"Wbspage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"

In [25]:
ed = website("https://edwarddonner.com/")
print(ed.title)

Home - Edward Donner


In [26]:
print(ed.get_contents())

Wbspage Title:
Home - Edward Donner
Webpage Contents:
Home
Connect Four
Outsmart
An arena that pits LLMs against each other in a battle of diplomacy and deviousness
About
Posts
Well, hi there.
I’m Ed. I like writing code and experimenting with LLMs, and hopefully you’re here because you do too. I also enjoy DJing (but I’m badly out of practice), amateur electronic music production (
very
amateur) and losing myself in
Hacker News
, nodding my head sagely to things I only half understand.
I’m the co-founder and CTO of
Nebula.io
. We’re applying AI to a field where it can make a massive, positive impact: helping people discover their potential and pursue their reason for being. Recruiters use our product today to source, understand, engage and manage talent. I’m previously the founder and CEO of AI startup untapt,
acquired in 2021
.
We work with groundbreaking, proprietary LLMs verticalized for talent, we’ve
patented
our matching model, and our award-winning platform has happy customers a

In [27]:
print(ed.links)

['https://edwarddonner.com/', 'https://edwarddonner.com/connect-four/', 'https://edwarddonner.com/outsmart/', 'https://edwarddonner.com/about-me-and-about-nebula/', 'https://edwarddonner.com/posts/', 'https://edwarddonner.com/', 'https://news.ycombinator.com', 'https://nebula.io/?utm_source=ed&utm_medium=referral', 'https://www.prnewswire.com/news-releases/wynden-stark-group-acquires-nyc-venture-backed-tech-startup-untapt-301269512.html', 'https://patents.google.com/patent/US20210049536A1/', 'https://www.linkedin.com/in/eddonner/', 'https://edwarddonner.com/2025/05/28/connecting-my-courses-become-an-llm-expert-and-leader/', 'https://edwarddonner.com/2025/05/28/connecting-my-courses-become-an-llm-expert-and-leader/', 'https://edwarddonner.com/2025/05/18/2025-ai-executive-briefing/', 'https://edwarddonner.com/2025/05/18/2025-ai-executive-briefing/', 'https://edwarddonner.com/2025/04/21/the-complete-agentic-ai-engineering-course/', 'https://edwarddonner.com/2025/04/21/the-complete-agentic

In [46]:
link_system_prompt = """
You are an intelligent assistant trained to help extract relevant links from a company's website for use in a brochure.
Given a list of URLs found on a webpage, you should identify which ones are most useful for understanding the company,
such as:
- About Us / Company Overview
- Team or Leadership Page
- Careers / Jobs
- Products / Solutions
- Customers / Case Studies
- News / Blog
- Contact Information

Your job is to filter out irrelevant links like:
- Terms of Service
- Privacy Policy
- Social media (unless it's official and informative)
- Email links (mailto:)
- External or unrelated links

Respond strictly in JSON with this structure:
{
  "links": [
    {
      "type": "about page",
      "url": "https://example.com/about "
    }
  ]
}
"""

In [47]:
print(link_system_prompt)


You are an intelligent assistant trained to help extract relevant links from a company's website for use in a brochure.
Given a list of URLs found on a webpage, you should identify which ones are most useful for understanding the company,
such as:
- About Us / Company Overview
- Team or Leadership Page
- Careers / Jobs
- Products / Solutions
- Customers / Case Studies
- News / Blog
- Contact Information

Your job is to filter out irrelevant links like:
- Terms of Service
- Privacy Policy
- Social media (unless it's official and informative)
- Email links (mailto:)
- External or unrelated links

Respond strictly in JSON with this structure:
{
  "links": [
    {
      "type": "about page",
      "url": "https://example.com/about "
    }
  ]
}



In [48]:
def get_links_user_prompt(website):
    user_prompt = f"Here is the list of links on the website of {website.url} - "
    user_prompt += "please decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format. Do not include Terms of Service, Privacy, email links.\n"
    user_prompt += "\n".join(website.links)
    return user_prompt

In [49]:
print(get_links_user_prompt(ed))

Here is the list of links on the website of https://edwarddonner.com/ - please decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format. Do not include Terms of Service, Privacy, email links.
https://edwarddonner.com/
https://edwarddonner.com/connect-four/
https://edwarddonner.com/outsmart/
https://edwarddonner.com/about-me-and-about-nebula/
https://edwarddonner.com/posts/
https://edwarddonner.com/
https://news.ycombinator.com
https://nebula.io/?utm_source=ed&utm_medium=referral
https://www.prnewswire.com/news-releases/wynden-stark-group-acquires-nyc-venture-backed-tech-startup-untapt-301269512.html
https://patents.google.com/patent/US20210049536A1/
https://www.linkedin.com/in/eddonner/
https://edwarddonner.com/2025/05/28/connecting-my-courses-become-an-llm-expert-and-leader/
https://edwarddonner.com/2025/05/28/connecting-my-courses-become-an-llm-expert-and-leader/
https://edwarddonner.com/2025/05/18/2025-ai-executive

In [54]:
def get_links(url):
    web = website(url)
    responses = openai.chat.completions.create(
        model = MODEL,
        messages = [
            {"role": "system", "content": link_system_prompt},
            {"role": "user", "content": get_links_user_prompt(web)}
        ],
        response_format = {"type": "json_object"}
    )
    result = responses.choices[0].message.content
    return json.loads(result)

In [51]:
get_links("https://www.anthropic.com/")

{'links': [{'type': 'company overview', 'url': 'https://www.anthropic.com/'},
  {'type': "about the founderClaude's page",
   'url': 'https://www.anthropic.com/claude'},
  {'type': 'team leadership', 'url': 'https://www.anthropic.com/team'},
  {'type': 'products solutions',
   'url': 'https://www.anthropic.com/solutions/agents'},
  {'type': 'products solutions',
   'url': 'https://www.anthropic.com/solutions/coding'},
  {'type': 'products solutions',
   'url': 'https://www.anthropic.com/solutions/customer-support'},
  {'type': 'research and economic index',
   'url': 'https://www.anthropic.com/research'},
  {'type': 'economic index',
   'url': 'https://www.anthropic.com/economic-index'},
  {'type': 'press and transparency',
   'url': 'https://www.anthropic.com/transparency'},
  {'type': 'transparency',
   'url': 'https://www.anthropic.com/responsible-scaling-policy'},
  {'type': 'customer care',
   'url': 'https://www.anthropic.com/solutions/customer-support'},
  {'type': 'engineering 

# Make the brochure

In [55]:
def get_all_details(url):
    result = "Landing Page: \n"
    result += website(url).get_contents()
    links = get_links(url)
    print("found links:", links)
    for link in links["links"]:
        result += f"\n\n{link['type']}\n"
        result += website(link["url"]).get_contents()
    return result

In [56]:
get_all_details("https://www.anthropic.com/")

found links: {'links': [{'type': 'about page', 'url': 'https://www.anthropic.com/'}, {'type': 'team leadership page', 'url': 'https://www.anthropic.com/team'}, {'type': 'careers jobs page', 'url': 'https://www.anthropic.com/careers'}, {'type': 'products solutions page', 'url': 'https://www.anthropic.com/solutions/agents'}, {'type': 'news blog page', 'url': 'https://www.anthropic.com/news/'}, {'type': 'customers case studies page', 'url': 'https://www.anthropic.com/customers'}, {'type': 'products solutions page', 'url': 'https://www.anthropic.com/solutions/coding'}, {'type': 'company overview page', 'url': 'https://www.anthropic.com/company'}, {'type': 'research and developments page', 'url': 'https://www.anthropic.com/research'}, {'type': 'engineering page', 'url': 'https://www.anthropic.com/engineering'}]}


'Landing Page: \nWbspage Title:\nHome \\ Anthropic\nWebpage Contents:\nSkip to main content\nSkip to footer\nClaude\nChat with Claude\nOverview\nTeam plan\nEnterprise plan\nEducation plan\nExplore pricing\nDownload apps\nClaude log in\nNews\nClaude’s character\nAPI\nBuild with Claude\nAPI\xa0overview\nDeveloper docs\nExplore pricing\nConsole log in\nNews\nLearn how to build with Claude\nSolutions\nCollaborate with Claude\nAI\xa0agents\nCoding\nCustomer support\nCase studies\nHear from our customers\nResearch\nResearch\nOverview\nEconomic Index\nClaude model family\nClaude Opus 4\nClaude Sonnet 4\nClaude Haiku 3.5\nResearch\nClaude’s extended thinking\nCommitments\nInitiatives\nTransparency\nResponsible scaling policy\nTrust center\nSecurity and compliance\nAnnouncement\nISO\xa042001 certification\nLearn\nLearning resources\nCustomer stories\nEngineering at Anthropic\nAnthropic Academy\nCompany\nAbout\nCareers\nEvents\nEngineering\nBuilding effective agents\nNews\nEN\nThis is some text 

In [57]:
system_prompt = """
You are an AI assistant that reads and analyzes content from a company's website and generates a short, professional brochure in Markdown format. 

Target audience:
- Prospective customers
- Investors
- Potential employees

Include these sections if possible:
1. Company Overview
2. Products & Services
3. Customers & Case Studies
4. Technology & Innovation
5. Culture & Careers
6. Why Choose Us

Use only factual information from the provided content. Do not make up details.
"""

In [58]:
def get_brochure_user_prompt(company_name, url):
    user_prompt = f"you are looking at a company called: {company_name} \n"
    user_prompt += f"Here are the contents of its landing page and other relavant pages; use this inofrmation to build a short brochure of "
    user_prompt += get_all_details(url)
    user_prompt = user_prompt[:20000]
    return user_prompt

In [59]:
def create_brochure(company_name, url):
    response = openai.chat.completions.create(
        model = MODEL,
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": get_brochure_user_prompt(company_name, url)}
        ]
    )
    result = response.choices[0].message.content
    display(Markdown(result))

In [60]:
create_brochure("Anthropic", "https://www.anthropic.com/")

found links: {'links': [{'type': 'about page', 'url': 'https://www.anthropic.comcompany'}, {'type': 'team or leadership page', 'url': 'https://www.anthropic.com/team'}, {'type': 'careers / jobs', 'url': 'https://www.anthropic.com/careers'}, {'type': 'products / solutions', 'url': 'https://www.anthropic.com/solutions/agents'}, {'type': 'customers / case studies', 'url': 'https://www.anthropic.com/customers'}, {'type': 'news / blog', 'url': 'https://www.anthropic.com/news'}]}


ConnectionError: HTTPSConnectionPool(host='www.anthropic.comcompany', port=443): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x75b56298fd00>: Failed to resolve 'www.anthropic.comcompany' ([Errno -2] Name or service not known)"))