In [1]:
# imports
# If these fail, please check you're running from an 'activated' environment with (llms) in the command prompt

import os
import requests
import json
from typing import List
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display, update_display
from openai import OpenAI

In [2]:
from openai import OpenAI
MODEL = "llama3.2"
openai = OpenAI(base_url="http://localhost:11434/v1", api_key="ollama")

In [3]:
# A class to represent a Webpage

# Some websites need you to use proper headers when fetching them:
headers = {
 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}

class Website:
    """
    A utility class to represent a Website that we have scraped, now with links
    """

    def __init__(self, url):
        self.url = url
        response = requests.get(url, headers=headers)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')
        self.title = soup.title.string if soup.title else "No title found"
        if soup.body:
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""
        links = [link.get('href') for link in soup.find_all('a')]
        self.links = [link for link in links if link]

    def get_contents(self):
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"

In [4]:
link_system_prompt = "You are provided with a list of links found on a webpage. \
You are able to decide which of the links would be most relevant to include in a brochure about the company, \
such as links to an About page, or a Company page, or Careers/Jobs pages.\n"
link_system_prompt += "You should respond in JSON as in this example:"
link_system_prompt += """
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page": "url": "https://another.full.url/careers"}
    ]
}
"""

In [5]:
print(link_system_prompt)

You are provided with a list of links found on a webpage. You are able to decide which of the links would be most relevant to include in a brochure about the company, such as links to an About page, or a Company page, or Careers/Jobs pages.
You should respond in JSON as in this example:
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page": "url": "https://another.full.url/careers"}
    ]
}



In [7]:
def get_links_user_prompt(website):
    user_prompt = f"Here is the list of links on the website of {website.url} - "
    user_prompt += "please decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format. \
Do not include Terms of Service, Privacy, email links.\n"
    user_prompt += "Links (some might be relative links):\n"
    user_prompt += "\n".join(website.links)
    return user_prompt

In [8]:
def get_links(url):
    website = Website(url)
    response = openai.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": link_system_prompt},
            {"role": "user", "content": get_links_user_prompt(website)}
      ],
        response_format={"type": "json_object"}
    )
    result = response.choices[0].message.content
    return json.loads(result)

In [9]:
# Anthropic has made their site harder to scrape, so I'm using HuggingFace..

huggingface = Website("https://huggingface.co")
huggingface.links

['/',
 '/models',
 '/datasets',
 '/spaces',
 '/docs',
 '/enterprise',
 '/pricing',
 '/login',
 '/join',
 '/spaces',
 '/models',
 '/black-forest-labs/FLUX.1-Kontext-dev',
 '/tencent/Hunyuan-A13B-Instruct',
 '/google/magenta-realtime',
 '/nanonets/Nanonets-OCR-s',
 '/google/gemma-3n-E4B-it',
 '/models',
 '/spaces/enzostvs/deepsite',
 '/spaces/ilcve21/Sparc3D',
 '/spaces/OmniGen2/OmniGen2',
 '/spaces/tencent/Hunyuan3D-2.1',
 '/spaces/black-forest-labs/FLUX.1-Kontext-Dev',
 '/spaces',
 '/datasets/fka/awesome-chatgpt-prompts',
 '/datasets/institutional/institutional-books-1.0',
 '/datasets/EssentialAI/essential-web-v1.0',
 '/datasets/facebook/seamless-interaction',
 '/datasets/FreedomIntelligence/ShareGPT-4o-Image',
 '/datasets',
 '/join',
 '/pricing#endpoints',
 '/pricing#spaces',
 '/pricing',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/allenai',
 '/facebook',
 '/amazon',
 '/google',
 '/Intel',
 '/microsoft',
 '/grammar

In [10]:
get_links("https://huggingface.co")

{'links': [{'type': 'About page', 'url': 'https://huggingface.co'},
  {'type': 'Company/Brand page', 'url': 'https://brand.huggingface.co'},
  {'type': 'Changelog', 'url': 'https://changelog.huggingface.co'},
  {'type': 'Blog', 'url': 'https://blog.huggingface.co'},
  {'type': 'GitHub repository', 'url': 'https://github.com/huggingface'},
  {'type': 'Twitter handle', 'url': 'https://twitter.com/huggingface'},
  {'type': 'LinkedIn company page',
   'url': 'https://www.linkedin.com/company/huggingface/'},
  {'type': 'Contact/press email', 'url': 'mailto:press@huggingface.co'}]}

# Make the brochure

In [11]:
def get_all_details(url):
    result = "Landing page:\n"
    result += Website(url).get_contents()
    links = get_links(url)
    print("Found links:", links)
    for link in links["links"]:
        result += f"\n\n{link['type']}\n"
        result += Website(link["url"]).get_contents()
    return result

In [12]:
system_prompt = "You are an assistant that analyzes the contents of several relevant pages from a company website \
and creates a short brochure about the company for prospective customers, investors and recruits. Respond in markdown.\
Include details of company culture, customers and careers/jobs if you have the information."

# Or uncomment the lines below for a more humorous brochure - this demonstrates how easy it is to incorporate 'tone':

# system_prompt = "You are an assistant that analyzes the contents of several relevant pages from a company website \
# and creates a short humorous, entertaining, jokey brochure about the company for prospective customers, investors and recruits. Respond in markdown.\
# Include details of company culture, customers and careers/jobs if you have the information."


In [13]:
def get_brochure_user_prompt(company_name, url):
    user_prompt = f"You are looking at a company called: {company_name}\n"
    user_prompt += f"Here are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\n"
    user_prompt += get_all_details(url)
    user_prompt = user_prompt[:5_000] # Truncate if more than 5,000 characters
    return user_prompt

In [14]:
get_brochure_user_prompt("HuggingFace", "https://huggingface.co")

Found links: {'links': [{'type': 'Hugging Face Company Site', 'url': 'https://huggingface.co'}, {'type': 'GitHub Repository', 'url': 'https://github.com/huggingface'}, {'type': 'Twitter Handle', 'url': 'https://twitter.com/huggingface'}, {'type': 'LinkedIn Company Page', 'url': 'https://www.linkedin.com/company/huggingface/'}, {'type': 'About Page', 'url': 'https://about.huggingface.co'}, {'type': 'Blog', 'url': 'https://blog.huggingface.co'}, {'type': 'Discussions Forum', 'url': 'https://discuss.huggingface.co'}, {'type': 'Status Page', 'url': 'https://status.huggingface.co/'}, {'type': 'Changelog', 'url': 'https://changelog.huggingface.co'}, {'type': 'Hugging Face Branding', 'url': 'https://brand.huggingface.co'}]}


ConnectionError: HTTPSConnectionPool(host='about.huggingface.co', port=443): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x7e56e9c51f30>: Failed to resolve 'about.huggingface.co' ([Errno -5] No address associated with hostname)"))

##### def create_brochure(company_name, url):
    response = openai.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": get_brochure_user_prompt(company_name, url)}
          ],
    )
    result = response.choices[0].message.content
    display(Markdown(result))

In [140]:
create_brochure("HuggingFace", "https://huggingface.co")

In [141]:
def stream_brochure(company_name, url):
    stream = openai.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": get_brochure_user_prompt(company_name, url)}
          ],
        stream=True
    )
    
    response = ""
    display_handle = display(Markdown(""), display_id=True)
    for chunk in stream:
        response += chunk.choices[0].delta.content or ''
        response = response.replace("```","").replace("markdown", "")
        update_display(Markdown(response), display_id=display_handle.display_id)

# Anthropic - Company Brochure

## Company Overview
Here is a concise paragraph summarizing the content in 150 words or less, suitable for a brochure section titled 'Company Overview':

At Claude, we empower teams to work more efficiently and produce high-quality results. Our AI assistant serves as a virtual teammate that taps into shared expertise, streamlining tasks like writing emails and docs. With Claude, you can create with confidence, leveraging features like Projects to ground your knowledge and Artifacts to collaborate on documents and research. We also enable experts to serve others by expanding how each teammate contributes and sparking inspiration through sharing chats across the team. By harnessing our technology, teams are doubling productivity, resolving issues faster, and driving business growth. Our goal is to transform the way you work, enabling each member of your organization to deliver expert-level results.

## Products & Services
Here is a concise, professional paragraph suitable for a brochure section titled 'Products & Services':

Our suite of products offers solutions for productivity, analysis, and collaboration. Start with our Claude Pro plan, priced at $17 per month (with an annual subscription discount), which includes features such as web chat, code execution, and Google Workspace integration. Upgrade to our Team plan for $25 per person per month (with a minimum 5-member team) or Enterprise plan, tailored to businesses operating at scale. For educational institutions, we offer a comprehensive university-wide plan with discounted rates and dedicated API credits. Additionally, our Claude Opus 4 model provides advanced task capabilities, while our Claude Haiku 3.5 offers fast cost-effective solution for web search. Explore our pricing options, including individual, team, enterprise, and education plans, to find the best fit for your needs.

## Technology & Innovation
Information not available.

## Customers & Case Studies
Information not available.

## Culture & Careers
I'm happy to help you with that, but I don't see any content provided for me to summarize. Can you please provide the text you'd like me to condense, and I'll be more than happy to assist you in crafting a concise paragraph suitable for your brochure section titled "Culture & Careers"?

## News & Updates
Here is a concise paragraph summarizing recent news and updates from Anthropic:

At Anthropic, we're dedicated to advancing the field of artificial intelligence with integrity and responsibility. Recently, we've made significant strides in our mission, including raising a Series E funding round worth $61.5B post-money valuation and expanding our offerings with new capabilities, such as Claude 4, which enables researchers to connect AI models with external data. Our Economic Index provides vital insights into the impact of AI on software development, while our Economic Futures Program aims to foster informed decision-making in the face of rapid technological change. Additionally, we've made major strides in securing federal and state approvals for our technology, including a recent approval from the Federal Government's FedRAMP High program. Our commitment to transparency and best practices continues to drive our work at Anthropic.

## Research & Thought Leadership
At Anthropic, our dedicated research teams are leading the charge in developing innovative machine learning systems that prioritize safety, steerability, and reliability. By pushing the boundaries of artificial intelligence, we aim to create large-scale AI systems that can drive positive impact across various fields. Our research efforts focus on advancing the field of AI development, enabling us to design more effective solutions for complex problems. With a commitment to responsible innovation, our teams are at the forefront of exploring new approaches and methodologies that will shape the future of AI and its applications.



In [142]:
stream_brochure("HuggingFace", "https://huggingface.co")

In [None]:
# Try changing the system prompt to the humorous version when you make the Brochure for Hugging Face:

stream_brochure("HuggingFace", "https://huggingface.co")

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


Error fetching https://www.anthropic.com/research-alignments-faking: 404 Client Error: Not Found for url: https://www.anthropic.com/research-alignments-faking
Failed to summarize Company Overview: Connection error.
Failed to summarize Customers & Case Studies: Connection error.
Failed to summarize Culture & Careers: Error code: 500 - {'error': {'message': 'model runner has unexpectedly stopped, this may be due to resource limitations or an internal error, check ollama server logs for details', 'type': 'api_error', 'param': None, 'code': None}}
