In [54]:
import os
import requests
import json
from typing import List
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display, update_display
from mistralai import Mistral

In [55]:
load_dotenv()
API_KEY = os.getenv("MISTRAL_API_KEY")
mistral = Mistral(api_key=API_KEY)

model = "ministral-8b-2410"

In [56]:
headers = {
 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}

class Website:
    """
    A utility class to represent a Website that we have scraped, now with links
    """

    def __init__(self, url):
        self.url = url
        response = requests.get(url, headers=headers)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')
        self.title = soup.title.string if soup.title else "No title found"
        if soup.body:
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""
        links = [link.get('href') for link in soup.find_all('a')]
        self.links = [link for link in links if link]

    def get_contents(self):
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"

In [57]:
ed = Website("https://edwarddonner.com")
ed.links

['https://edwarddonner.com/',
 'https://edwarddonner.com/connect-four/',
 'https://edwarddonner.com/outsmart/',
 'https://edwarddonner.com/about-me-and-about-nebula/',
 'https://edwarddonner.com/posts/',
 'https://edwarddonner.com/',
 'https://news.ycombinator.com',
 'https://nebula.io/?utm_source=ed&utm_medium=referral',
 'https://www.prnewswire.com/news-releases/wynden-stark-group-acquires-nyc-venture-backed-tech-startup-untapt-301269512.html',
 'https://patents.google.com/patent/US20210049536A1/',
 'https://www.linkedin.com/in/eddonner/',
 'https://edwarddonner.com/2025/09/15/ai-in-production-gen-ai-and-agentic-ai-on-aws-at-scale/',
 'https://edwarddonner.com/2025/09/15/ai-in-production-gen-ai-and-agentic-ai-on-aws-at-scale/',
 'https://edwarddonner.com/2025/05/28/connecting-my-courses-become-an-llm-expert-and-leader/',
 'https://edwarddonner.com/2025/05/28/connecting-my-courses-become-an-llm-expert-and-leader/',
 'https://edwarddonner.com/2025/05/18/2025-ai-executive-briefing/',
 '

In [58]:
link_system_prompt = "You are provided with a list of links found on a webpage. \
You are able to decide which of the links would be most relevant to include in a brochure about the company, \
such as links to an About page, or a Company page, or Careers/Jobs pages.\n"
link_system_prompt += "You should respond in JSON as in this example:"
link_system_prompt += """
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page", "url": "https://another.full.url/careers"}
    ]
}
"""

In [59]:
print(link_system_prompt)

You are provided with a list of links found on a webpage. You are able to decide which of the links would be most relevant to include in a brochure about the company, such as links to an About page, or a Company page, or Careers/Jobs pages.
You should respond in JSON as in this example:
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page", "url": "https://another.full.url/careers"}
    ]
}



In [60]:
def get_links_user_prompt(website):
    user_prompt = f"Here is the list of links on the website of {website.url} - "
    user_prompt += "please decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format. \
Do not include Terms of Service, Privacy, email links.\n"
    user_prompt += "Links (some might be relative links):\n"
    user_prompt += "\n".join(website.links)
    return user_prompt

In [61]:
print(get_links_user_prompt(ed))

Here is the list of links on the website of https://edwarddonner.com - please decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format. Do not include Terms of Service, Privacy, email links.
Links (some might be relative links):
https://edwarddonner.com/
https://edwarddonner.com/connect-four/
https://edwarddonner.com/outsmart/
https://edwarddonner.com/about-me-and-about-nebula/
https://edwarddonner.com/posts/
https://edwarddonner.com/
https://news.ycombinator.com
https://nebula.io/?utm_source=ed&utm_medium=referral
https://www.prnewswire.com/news-releases/wynden-stark-group-acquires-nyc-venture-backed-tech-startup-untapt-301269512.html
https://patents.google.com/patent/US20210049536A1/
https://www.linkedin.com/in/eddonner/
https://edwarddonner.com/2025/09/15/ai-in-production-gen-ai-and-agentic-ai-on-aws-at-scale/
https://edwarddonner.com/2025/09/15/ai-in-production-gen-ai-and-agentic-ai-on-aws-at-scale/
https://edward

In [62]:
def get_links(url):
    website = Website(url)
    response = mistral.chat.complete(
        model=model,
        messages=[
            {"role": "system", "content": link_system_prompt},
            {"role": "user", "content": get_links_user_prompt(website)}
      ],
        response_format={"type": "json_object"}
    )
    result = response.choices[0].message.content
    return json.loads(result)

In [63]:
huggingface = Website("https://huggingface.co")
huggingface.links

['/',
 '/models',
 '/datasets',
 '/spaces',
 '/docs',
 '/enterprise',
 '/pricing',
 '/login',
 '/join',
 '/spaces',
 '/models',
 '/neuphonic/neutts-air',
 '/zai-org/GLM-4.6',
 '/Qwen/Qwen3-VL-30B-A3B-Instruct',
 '/ServiceNow-AI/Apriel-1.5-15b-Thinker',
 '/inclusionAI/Ling-1T',
 '/models',
 '/spaces/enzostvs/deepsite',
 '/spaces/Wan-AI/Wan2.2-Animate',
 '/spaces/zerogpu-aoti/wan2-2-fp8da-aoti-faster',
 '/spaces/neuphonic/neutts-air',
 '/spaces/ibm-granite/Granite-4.0-WebGPU',
 '/spaces',
 '/datasets/Agent-Ark/Toucan-1.5M',
 '/datasets/openai/gdpval',
 '/datasets/fka/awesome-chatgpt-prompts',
 '/datasets/Jr23xd23/ArabicText-Large',
 '/datasets/omniretarget/OmniRetarget_Dataset',
 '/datasets',
 '/join',
 '/pricing#endpoints',
 '/pricing#spaces',
 '/pricing',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/allenai',
 '/facebook',
 '/amazon',
 '/google',
 '/Intel',
 '/microsoft',
 '/grammarly',
 '/Writer',
 '/docs/transforme

In [64]:
get_links("https://huggingface.co")

{'links': [{'type': 'about page', 'url': 'https://huggingface.co'},
  {'type': 'careers page', 'url': 'https://apply.workable.com/huggingface/'},
  {'type': 'company page', 'url': 'https://huggingface.co/enterprise'},
  {'type': 'company page', 'url': 'https://huggingface.co/docs'},
  {'type': 'company page', 'url': 'https://huggingface.co/docs/transformers'},
  {'type': 'company page', 'url': 'https://huggingface.co/docs/diffusers'},
  {'type': 'company page', 'url': 'https://huggingface.co/docs/safetensors'},
  {'type': 'company page',
   'url': 'https://huggingface.co/docs/huggingface_hub'},
  {'type': 'company page', 'url': 'https://huggingface.co/docs/tokenizers'},
  {'type': 'company page', 'url': 'https://huggingface.co/docs/trl'},
  {'type': 'company page',
   'url': 'https://huggingface.co/docs/transformers.js'},
  {'type': 'company page', 'url': 'https://huggingface.co/docs/smolagents'},
  {'type': 'company page', 'url': 'https://huggingface.co/docs/peft'},
  {'type': 'compan

In [65]:
def get_all_details(url):
    result = "Landing page:\n"
    result += Website(url).get_contents()
    links = get_links(url)
    # print("Found links:", links)
    for link in links["links"]:
        result += f"\n\n{link['type']}\n"
        result += Website(link["url"]).get_contents()
    return result

In [66]:
print(get_all_details("https://huggingface.co"))

Landing page:
Webpage Title:
Hugging Face – The AI community building the future.
Webpage Contents:
Hugging Face
Models
Datasets
Spaces
Community
Docs
Enterprise
Pricing
Log In
Sign Up
The AI community building the future.
The platform where the machine learning community collaborates on models, datasets, and applications.
Explore AI Apps
or
Browse 1M+ models
Trending on
this week
Models
neuphonic/neutts-air
Updated
about 16 hours ago
•
11.4k
•
398
zai-org/GLM-4.6
Updated
10 days ago
•
24.7k
•
652
Qwen/Qwen3-VL-30B-A3B-Instruct
Updated
1 day ago
•
412k
•
204
ServiceNow-AI/Apriel-1.5-15b-Thinker
Updated
4 days ago
•
9.24k
•
350
inclusionAI/Ling-1T
Updated
1 day ago
•
715
•
180
Browse 1M+ models
Spaces
Running
14.9k
14.9k
DeepSite v3
🐳
Generate any application by Vibe Coding
Running
1.61k
1.61k
Wan2.2 Animate
👁
Wan2.2 Animate
Running
on
Zero
MCP
1.62k
1.62k
Wan2.2 14B Fast
🎥
generate a video from an image with a text prompt
Running
on
Zero
MCP
130
130
NeuTTS-Air
☁
Generate speech from te

In [67]:
system_prompt = "You are an assistant that analyzes the contents of several relevant pages from a company website \
and creates a short brochure about the company for prospective customers, investors and recruits. Respond in markdown.\
Include details of company culture, customers and careers/jobs if you have the information."

# Or uncomment the lines below for a more humorous brochure - this demonstrates how easy it is to incorporate 'tone':

# system_prompt = "You are an assistant that analyzes the contents of several relevant pages from a company website \
# and creates a short humorous, entertaining, jokey brochure about the company for prospective customers, investors and recruits. Respond in markdown.\
# Include details of company culture, customers and careers/jobs if you have the information."

In [68]:
def get_brochure_user_prompt(company_name, url):
    user_prompt = f"You are looking at a company called: {company_name}\n"
    user_prompt += f"Here are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\n"
    user_prompt += get_all_details(url)
    user_prompt = user_prompt[:5_000] # Truncate if more than 5,000 characters
    return user_prompt

In [69]:
get_brochure_user_prompt("HuggingFace", "https://huggingface.co")

'You are looking at a company called: HuggingFace\nHere are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\nLanding page:\nWebpage Title:\nHugging Face – The AI community building the future.\nWebpage Contents:\nHugging Face\nModels\nDatasets\nSpaces\nCommunity\nDocs\nEnterprise\nPricing\nLog In\nSign Up\nThe AI community building the future.\nThe platform where the machine learning community collaborates on models, datasets, and applications.\nExplore AI Apps\nor\nBrowse 1M+ models\nTrending on\nthis week\nModels\nneuphonic/neutts-air\nUpdated\nabout 16 hours ago\n•\n11.4k\n•\n398\nzai-org/GLM-4.6\nUpdated\n10 days ago\n•\n24.7k\n•\n652\nQwen/Qwen3-VL-30B-A3B-Instruct\nUpdated\n1 day ago\n•\n412k\n•\n204\nServiceNow-AI/Apriel-1.5-15b-Thinker\nUpdated\n4 days ago\n•\n9.24k\n•\n350\ninclusionAI/Ling-1T\nUpdated\n1 day ago\n•\n715\n•\n180\nBrowse 1M+ models\nSpaces\nRunning\n14.9k\n14.9k\nDeepSite v3\n

In [70]:
def create_brochure(company_name, url):
    response = mistral.chat.complete(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": get_brochure_user_prompt(company_name, url)}
          ],
    )
    result = response.choices[0].message.content
    display(Markdown(result))

In [71]:
create_brochure("HuggingFace", "https://huggingface.co")

# Hugging Face Brochure

## About Hugging Face

Hugging Face is the AI community building the future. We are a platform where the machine learning community collaborates on models, datasets, and applications. Our mission is to make machine learning accessible to everyone.

## What We Offer

### Models
- **1M+ Models**: Browse and explore a vast library of state-of-the-art AI models.
- **Trending Models**: Discover the latest and most popular models in the community.

### Datasets
- **250k+ Datasets**: Access and share datasets for any machine learning task.
- **Trending Datasets**: Stay updated with the latest datasets in the community.

### Spaces
- **400k+ Applications**: Run and explore a wide range of applications built by the community.
- **Trending Spaces**: Discover the most popular applications in the community.

## Our Community

Hugging Face is home to over 50,000 organizations and a vibrant community of developers, researchers, and enthusiasts. We foster collaboration and innovation through our platform.

## Enterprise Solutions

### Compute
- **Optimized Inference Endpoints**: Deploy models on optimized endpoints.
- **GPU Support**: Update your Spaces applications to a GPU in a few clicks.
- **Pricing**: Starting at $0.60/hour for GPU.

### Team & Enterprise
- **Enterprise-Grade Security**: Advanced security features for your team.
- **Dedicated Support**: Priority support for your enterprise needs.
- **Pricing**: Starting at $20/user/month.

## Open Source

Hugging Face is committed to open source. We provide a suite of tools and libraries to build and deploy machine learning models.

- **Transformers**: State-of-the-art AI models for PyTorch.
- **Diffusers**: State-of-the-art Diffusion models in PyTorch.
- **Tokenizers**: Fast tokenizers optimized for research and production.
- **And more**: Explore our full suite of open-source tools.

## Company Culture

At Hugging Face, we believe in collaboration, innovation, and making machine learning accessible to everyone. Our culture is built on open-source principles, and we value diversity, inclusion, and continuous learning.

## Careers

Join our team and help shape the future of AI. We are always looking for talented individuals to join our mission. Explore our [careers page](https://huggingface.co/jobs) to learn more about our open positions.

## Contact Us

- **Website**: [huggingface.co](https://huggingface.co)
- **GitHub**: [huggingface.co](https://github.com/huggingface)
- **Twitter**: [@huggingface](https://twitter.com/huggingface)
- **LinkedIn**: [huggingface](https://www.linkedin.com/company/huggingface)
- **Discord**: [huggingface](https://discord.gg/huggingface)
- **Zhihu**: [huggingface](https://www.zhihu.com/people/huggingface)
- **WeChat**: [huggingface](https://wechat.com/huggingface)

---

**Hugging Face – The AI community building the future.**

In [72]:
def stream_brochure(company_name, url):
    stream = mistral.chat.stream(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": get_brochure_user_prompt(company_name, url)}
          ],
    )
    
    response = ""
    display_handle = display(Markdown(""), display_id=True)
    for chunk in stream:
        response += chunk.data.choices[0].delta.content or ''
        response = response.replace("```","").replace("markdown", "")
        update_display(Markdown(response), display_id=display_handle.display_id)

In [73]:
stream_brochure("HuggingFace", "https://huggingface.co")

# Hugging Face Brochure

## About Hugging Face

Hugging Face is the AI community building the future. We are a platform where the machine learning community collaborates on models, datasets, and applications. Our mission is to make machine learning accessible to everyone.

## What We Offer

### Models
- **1M+ Models**: Browse and explore a vast collection of state-of-the-art AI models.
- **Trending Models**: Discover the latest and most popular models in the community.

### Datasets
- **250k+ Datasets**: Access and share datasets for any machine learning task.
- **Trending Datasets**: Stay updated with the latest datasets being used in the community.

### Spaces
- **400k+ Applications**: Explore and run a wide range of applications built by the community.
- **Trending Spaces**: Discover the most popular applications and tools.

## Our Community

Hugging Face is home to over 50,000 organizations and a vibrant community of developers, researchers, and enthusiasts. We support a diverse range of industries, including AI2, Meta, Amazon, Google, Intel, Microsoft, and more.

## Open Source

We are committed to building the foundation of machine learning tooling with the community. Our open-source projects include:

- **Transformers**: State-of-the-art AI models for PyTorch.
- **Diffusers**: State-of-the-art Diffusion models in PyTorch.
- **Safetensors**: Safe way to store/distribute neural network weights.
- **Hub Python Library**: Python client to interact with the Hugging Face Hub.
- **Tokenizers**: Fast tokenizers optimized for research & production.
- **TRL**: Train transformers LMs with reinforcement learning.
- **Transformers.js**: State-of-the-art ML running directly in your browser.
- **Smolagents**: Smol library to build great agents in Python.
- **PEFT**: Parameter-efficient finetuning for large language models.
- **Datasets**: Access & share datasets for any ML tasks.
- **Text Generation Inference**: Serve language models with TGI optimized toolkit.
- **Accelerate**: Train PyTorch models with multi-GPU, TPU, mixed precision.

## Enterprise Solutions

For businesses looking to accelerate their machine learning efforts, we offer:

- **Compute**: Deploy on optimized Inference Endpoints or update your Spaces applications to a GPU in a few clicks.
- **Team & Enterprise**: Give your team the most advanced platform to build AI with enterprise-grade security, access controls, and dedicated support.

## Company Culture

Hugging Face fosters a collaborative and inclusive environment. We believe in the power of open-source and community-driven innovation. Our team is passionate about making machine learning accessible to everyone.

## Careers

Join our team and help shape the future of AI. We are always looking for talented individuals to contribute to our mission. Visit our [Jobs](https://huggingface.co/jobs) page to explore current openings.

## Contact Us

- **Website**: [huggingface.co](https://huggingface.co)
- **GitHub**: [huggingface.co](https://github.com/huggingface)
- **Twitter**: [@huggingface](https://twitter.com/huggingface)
- **LinkedIn**: [huggingface](https://www.linkedin.com/company/huggingface)
- **Discord**: [huggingface](https://discord.gg/huggingface)
- **Zhihu**: [huggingface](https://www.zhihu.com/people/huggingface)
- **WeChat**: [huggingface](https://wechat.com/huggingface)

---

**Hugging Face – The AI community building the future.**