In [1]:
import os
import requests
import json
from typing import List
from bs4 import BeautifulSoup
from IPython.display import Markdown, display, update_display
from dotenv import load_dotenv
from openai import OpenAI

In [3]:
# Load environment variables, sanity-check the API key, and create the client

load_dotenv(override=True)
api_key = os.getenv('OPENAI_API_KEY')
# Only a rough plausibility check: the key must exist and be longer than 10 characters
if not api_key or len(api_key) <= 10:
    print("There might be a problem with your API key? Please visit the troubleshooting notebook!")
else:
    print("API key looks good so far")

MODEL = 'gpt-4o-mini'
openai = OpenAI()

API key looks good so far


In [4]:
# Request headers sent with every fetch; some sites refuse requests
# that do not carry a browser-like User-Agent.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/117.0.0.0 Safari/537.36"
    ),
}

class Website:
    """
    A utility class to represent a Website that we have scraped, now with links.

    Attributes set by the constructor:
        url:   the address that was fetched
        body:  the raw response bytes
        title: the page title, or "No title found"
        text:  the text of <body> with script/style/img/input elements removed
        links: every non-empty href found on an <a> tag, as written in the page
               (relative links are NOT resolved)
    """

    def __init__(self, url, timeout=30):
        """
        Fetch the page at `url` and parse its title, text and links.

        Args:
            url: Address of the page to download.
            timeout: Seconds to wait for the server before giving up. Without a
                timeout, `requests.get` may block indefinitely on a stalled host.

        Raises:
            requests.RequestException: if the request fails or times out.
        """
        self.url = url
        response = requests.get(url, headers=headers, timeout=timeout)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')

        # `.string` is None when <title> is empty or has several children,
        # so fall back to the placeholder in that case as well.
        page_title = soup.title.string if soup.title else None
        self.title = page_title if page_title else "No title found"

        if soup.body:
            # Drop elements that contribute no readable text
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""

        # Keep only anchors that actually carry an href value
        hrefs = (anchor.get('href') for anchor in soup.find_all('a'))
        self.links = [href for href in hrefs if href]

    def get_contents(self):
        """Return the title and page text formatted as one prompt-ready string."""
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"

In [5]:
# Scrape a sample site and inspect the hrefs collected from its <a> tags
ed = Website("https://edwarddonner.com")
ed.links

['https://edwarddonner.com/',
 'https://edwarddonner.com/connect-four/',
 'https://edwarddonner.com/outsmart/',
 'https://edwarddonner.com/about-me-and-about-nebula/',
 'https://edwarddonner.com/posts/',
 'https://edwarddonner.com/',
 'https://news.ycombinator.com',
 'https://nebula.io/?utm_source=ed&utm_medium=referral',
 'https://www.prnewswire.com/news-releases/wynden-stark-group-acquires-nyc-venture-backed-tech-startup-untapt-301269512.html',
 'https://patents.google.com/patent/US20210049536A1/',
 'https://www.linkedin.com/in/eddonner/',
 'https://edwarddonner.com/2025/04/21/the-complete-agentic-ai-engineering-course/',
 'https://edwarddonner.com/2025/04/21/the-complete-agentic-ai-engineering-course/',
 'https://edwarddonner.com/2025/01/23/llm-workshop-hands-on-with-agents-resources/',
 'https://edwarddonner.com/2025/01/23/llm-workshop-hands-on-with-agents-resources/',
 'https://edwarddonner.com/2024/12/21/llm-resources-superdatascience/',
 'https://edwarddonner.com/2024/12/21/llm-

In [11]:
def get_links_user_prompt(website):
    """
    Build the user prompt that asks the model to pick brochure-relevant links.

    Args:
        website: object exposing `url` (str) and `links` (iterable of str).

    Returns:
        The instruction text followed by the links, one per line.
    """
    intro = f"Here is the list of links on the website of {website.url} - "
    # The run of spaces before "Do not include" is part of the prompt text as sent
    instructions = (
        "please decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format. "
        "    Do not include Terms of Service, Privacy, email links.\n"
    )
    heading = "Links (some might be relative links):\n"
    link_lines = "\n".join(website.links)
    return intro + instructions + heading + link_lines

In [17]:
def get_relevant_links(url):
    """
    Scrape `url` and ask the model which of its links belong in a brochure.

    The request uses JSON mode, and the reply text is parsed with `json.loads`
    before being returned.
    """
    page = Website(url)
    conversation = [
        {"role": "system", "content": link_system_prompt},
        {"role": "user", "content": get_links_user_prompt(page)},
    ]
    completion = openai.chat.completions.create(
        model=MODEL,
        messages=conversation,
        response_format={"type": "json_object"},
    )
    reply_text = completion.choices[0].message.content
    return json.loads(reply_text)

## One Shot Prompting
The system prompt includes a single example of the JSON structure we expect the model to return for the links, which makes this one-shot prompting.

In [9]:
# System prompt for the link-selection call. It ends with a single worked
# example (one-shot) of the JSON the model should return; the example must be
# valid JSON so the model is not shown a malformed target format.
link_system_prompt = (
    "You are provided with a list of links found on a webpage. "
    "You are able to decide which of the links would be most relevant to include in a brochure about the company, "
    "such as links to an About page, or a Company page, or Careers/Jobs pages.\n"
)
link_system_prompt += "You should respond in JSON as in this example:"
link_system_prompt += """
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page", "url": "https://another.full.url/careers"}
    ]
}
"""

# Actual Link Scraping After Setup

In [16]:
# Scrape the Hugging Face landing page and show every href found on it
# (as the output shows, most of these are relative paths such as '/models')
huggingface = Website("https://huggingface.co")
huggingface.links

['/',
 '/models',
 '/datasets',
 '/spaces',
 '/posts',
 '/docs',
 '/enterprise',
 '/pricing',
 '/login',
 '/join',
 '/spaces',
 '/models',
 '/deepseek-ai/DeepSeek-Prover-V2-671B',
 '/nvidia/parakeet-tdt-0.6b-v2',
 '/nari-labs/Dia-1.6B',
 '/JetBrains/Mellum-4b-base',
 '/Qwen/Qwen3-235B-A22B',
 '/models',
 '/spaces/enzostvs/deepsite',
 '/spaces/RiverZ/ICEdit',
 '/spaces/nari-labs/Dia-1.6B',
 '/spaces/Qwen/Qwen3-Demo',
 '/spaces/retwpay/waiNSFWIllustrious_v110',
 '/spaces',
 '/datasets/nvidia/Nemotron-CrossThink',
 '/datasets/rajpurkarlab/ReXGradient-160K',
 '/datasets/nvidia/OpenMathReasoning',
 '/datasets/nvidia/OpenCodeReasoning',
 '/datasets/deepseek-ai/DeepSeek-ProverBench',
 '/datasets',
 '/join',
 '/pricing#endpoints',
 '/pricing#spaces',
 '/pricing',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/allenai',
 '/facebook',
 '/amazon',
 '/google',
 '/Intel',
 '/microsoft',
 '/grammarly',
 '/Writer',
 '/docs/transforme

In [18]:
# Ask the LLM to pick the brochure-relevant links; the result is the parsed JSON reply
get_relevant_links("https://huggingface.co")

{'links': [{'type': 'about page', 'url': 'https://huggingface.co'},
  {'type': 'careers page', 'url': 'https://apply.workable.com/huggingface/'},
  {'type': 'blog', 'url': 'https://huggingface.co/blog'},
  {'type': 'community page', 'url': 'https://discuss.huggingface.co'},
  {'type': 'company profile',
   'url': 'https://www.linkedin.com/company/huggingface/'},
  {'type': 'contact page', 'url': 'https://huggingface.co/chat'}]}

In [19]:
# Function to scrape the details of the relevant links
def get_all_details(url):
    """
    Gather the contents of the landing page and of each relevant linked page.

    Args:
        url: Address of the company's landing page.

    Returns:
        One string: the landing page contents, followed by one section per
        selected link, each headed by the link's "type".
    """
    result = "Landing page:\n"
    result += Website(url).get_contents()
    # Must be get_relevant_links: no function called `get_links` exists in
    # this notebook, so that name raises NameError on a fresh kernel.
    links = get_relevant_links(url)
    print("Found links:", links)
    for link in links["links"]:
        result += f"\n\n{link['type']}\n"
        result += Website(link["url"]).get_contents()
    return result

# Making the Brochure

In [20]:
# System prompt for the brochure-writing call. Built from adjacent string
# literals so each sentence is visibly separated by a space (a backslash
# continuation had glued "markdown." and "Include" together).
system_prompt = (
    "You are an assistant that analyzes the contents of several relevant pages from a company website "
    "and creates a short brochure about the company for prospective customers, investors and recruits. "
    "Respond in markdown. "
    "Include details of company culture, customers and careers/jobs if you have the information."
)


In [21]:
def get_brochure_user_prompt(company_name, url):
    """
    Assemble the user prompt for the brochure request.

    The prompt names the company, states the task, and appends the scraped
    page contents from `get_all_details(url)`. The result is cut to its
    first 5,000 characters.
    """
    sections = [
        f"You are looking at a company called: {company_name}\n",
        "Here are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\n",
        get_all_details(url),
    ]
    full_prompt = "".join(sections)
    # Truncate if more than 5,000 characters
    return full_prompt[:5_000]

In [22]:
def create_brochure(company_name, url):
    """
    Generate a brochure for the company and render it as Markdown.

    Sends the system prompt plus the prompt built from the scraped pages to
    the model, then displays the reply in the notebook. Returns nothing.
    """
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": get_brochure_user_prompt(company_name, url)},
    ]
    completion = openai.chat.completions.create(
        model=MODEL,
        messages=conversation,
    )
    brochure_markdown = completion.choices[0].message.content
    display(Markdown(brochure_markdown))

In [23]:
# Generate and render the Hugging Face brochure in one go (non-streaming)
create_brochure("HuggingFace", "https://huggingface.co")

Found links: {'links': [{'type': 'about page', 'url': 'https://huggingface.co'}, {'type': 'careers page', 'url': 'https://apply.workable.com/huggingface/'}, {'type': 'company page', 'url': 'https://www.linkedin.com/company/huggingface/'}]}


# Hugging Face Brochure

## About Us
Welcome to Hugging Face, the AI community shaping the future of machine learning. Our platform serves as a collaborative hub where enthusiasts, developers, and organizations can come together to build, share, and innovate on AI models, datasets, and applications.

Discover a wealth of resources including **1M+ models** and **250k+ datasets** that fuel the creativity and development within the machine learning community.

## Our Offerings
### AI Models & Datasets
- **Explore Models**: With over 1 million models available, you can browse and utilize state-of-the-art machine learning resources for various tasks, from text generation to image processing.
- **Utilize Datasets**: Access an extensive library of datasets tailored for different machine learning tasks, enabling comprehensive research and application development.

### Compute & Enterprise Solutions
- **Compute**: Deploy your applications on optimized inference endpoints with GPU solutions starting at just $0.60/hour.
- **Enterprise**: Unlock the full potential of AI with our enterprise-grade solutions featuring advanced security, dedicated support, and tailored access for teams, starting at $20/user/month.

## Who We Serve
More than **50,000 organizations** trust Hugging Face including industry giants like:
- **Google**
- **Amazon**
- **Microsoft**
- **Meta**
- **Grammarly**

These partners leverage Hugging Face for cutting-edge AI advancements and collaborative projects.

## Company Culture
At Hugging Face, we pride ourselves on being community-driven. We’re passionate about open-source contributions and fostering collaboration among a diverse group of talent. Our inclusive environment encourages continuous learning and experimentation, making us a leading force in the AI landscape.

## Careers at Hugging Face
Join us in our mission to democratize AI! We are on the lookout for innovative and driven individuals to be part of our growing team. Check out available positions in various fields including engineering, research, and community management. 

Explore exciting career opportunities and help us build the future of artificial intelligence!

## Connect With Us
- **Website**: [huggingface.co](https://huggingface.co)
- **GitHub**: [Hugging Face GitHub](https://github.com/huggingface)
- **Twitter**: [@HuggingFace](https://twitter.com/huggingface)
- **LinkedIn**: [Hugging Face LinkedIn](https://linkedin.com/company/huggingface)
- **Discord**: Join our community discussions.

---
Join Hugging Face today and be part of a revolutionary platform that's not just about technology, but about building a community that empowers every AI enthusiast and expert!

# Converting the Brochure to a Stream Like Result

In [24]:
def stream_brochure(company_name, url):
    """
    Generate a brochure and render it incrementally as the model streams it.

    Args:
        company_name: Name of the company, inserted into the user prompt.
        url: Address of the company's landing page.

    The accumulated reply is re-rendered in a single display area after every
    chunk. Code-fence markers are removed so the notebook renders the brochure
    as Markdown rather than as a literal code block. Returns nothing.
    """
    stream = openai.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": get_brochure_user_prompt(company_name, url)}
          ],
        stream=True
    )

    raw_response = ""
    display_handle = display(Markdown(""), display_id=True)
    for chunk in stream:
        raw_response += chunk.choices[0].delta.content or ''
        # Strip only the fence markers. Replacing the bare word "markdown"
        # would also delete it from ordinary brochure sentences. Cleaning is
        # done on the full raw text each time so a fence split across chunks
        # is still removed once it is complete.
        cleaned = raw_response.replace("```markdown", "").replace("```", "")
        update_display(Markdown(cleaned), display_id=display_handle.display_id)

In [26]:
# Same brochure request, but rendered incrementally as the model streams its reply
stream_brochure("HuggingFace", "https://huggingface.co")

Found links: {'links': [{'type': 'about page', 'url': 'https://huggingface.co/huggingface'}, {'type': 'careers page', 'url': 'https://apply.workable.com/huggingface/'}, {'type': 'company blog', 'url': 'https://huggingface.co/blog'}, {'type': 'community discussion', 'url': 'https://discuss.huggingface.co'}, {'type': 'GitHub repository', 'url': 'https://github.com/huggingface'}, {'type': 'LinkedIn page', 'url': 'https://www.linkedin.com/company/huggingface/'}, {'type': 'Twitter page', 'url': 'https://twitter.com/huggingface'}]}



# Welcome to Hugging Face

### The AI Community Building the Future

Hugging Face is a thriving community dedicated to transforming the field of Artificial Intelligence (AI) and Machine Learning (ML). As a collaborative platform, we foster partnerships among enthusiasts, researchers, developers, and enterprises, uniting them to accelerate innovation and establish state-of-the-art models, datasets, and applications.

---

## Our Offerings

### Models & Datasets
- **Massive Repository**: Access over **1 million ML models** and **250,000 datasets** creating a vast resource for research and development.
- **Cutting-Edge Technology**: Use our sophisticated libraries such as **Transformers**, **Diffusers**, and **Tokenizers** tailored for developers and researchers alike.

### Collaboration Spaces
- **Hugging Spaces**: Host and collaborate on unlimited public models and applications, making it easier for teams to collaborate and innovate.

### Enterprise Solutions
Our comprehensive **Enterprise offerings** include:
- Custom solutions with **enterprise-grade security**, priority support, and dedicated access controls starting at **$20/user/month**.
- Deployment capabilities on optimized inference endpoints providing performance and scalability.

### Support and Community
Join a community of over **50,000 organizations** from diverse sectors including **Meta**, **Google**, **Microsoft**, and many others, all leveraging Hugging Face to advance their AI projects.

---

## Company Culture

At Hugging Face, we believe in the power of open-source collaboration. Our mission to democratize machine learning emphasizes transparency and shared knowledge. Our team, composed of over **215 members**, fosters a culture of inclusivity, creativity, and continuous learning.

We understand that collaboration is key to innovation, and we actively encourage our members to share their work, contribute to existing projects, and create new cutting-edge solutions to drive the AI field forward.

---

## Careers at Hugging Face

We are always on the lookout for bright minds to join our mission. If you are passionate about AI and ML and excited to shape the future of technology, **Hugging Face** is the place for you.

- Explore job openings across various roles in engineering, research, and support.
- Become part of a collaborative environment where your contributions will make a significant impact.

### Join Us!
Feel inspired? [Join the Hugging Face community today](https://huggingface.co) and help us in our quest to democratize machine learning and create impactful solutions for all.

---

**Connect With Us:**
- [GitHub](https://huggingface.co)
- [Twitter](https://twitter.com/huggingface)
- [LinkedIn](https://www.linkedin.com/company/huggingface)
- [Discord](https://discord.gg/huggingface)

Together, let's build the future of AI!

