In [1]:
import os
import requests
import json
from typing import List
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display, update_display
from openai import OpenAI

In [2]:
# Initialize and constants

# Pull variables from a .env file into the process environment, replacing
# any values that are already set (override=True).
load_dotenv(override=True)
api_key = os.getenv('OPENAI_API_KEY')

# Cheap sanity check only: the key must be present and longer than 10
# characters. Nothing here verifies the key against the API.
print(
    "API key looks good so far"
    if api_key and len(api_key) > 10
    else "There might be a problem with your API key? Please visit the troubleshooting notebook!"
)

# Model name used by every chat completion request in this notebook.
MODEL = 'gpt-4o-mini'
# Client is created without explicit arguments; api_key above is not passed in.
openai = OpenAI()

API key looks good so far


In [3]:
# Some websites need you to use proper headers when fetching them:
# these are sent with every page request made by the scraper below.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/117.0.0.0 Safari/537.36"
    ),
}

class Website:
    """
    A utility class to represent a Website that we have scraped, now with links

    Attributes set by the constructor:
        url:   the URL that was requested
        body:  the raw response content
        title: the page <title> text, or "No title found"
        text:  the text of <body> after script/style/img/input elements are
               removed, one fragment per line ("" when the page has no body)
        links: every non-empty href found on an <a> tag, exactly as written
               in the page (so entries may be relative)
    """

    def __init__(self, url, timeout=30):
        """
        Fetch `url` and extract its title, text and links.

        Args:
            url: address of the page to fetch.
            timeout: value forwarded to requests.get as its timeout (seconds).
                Without one, a server that stops responding would block the
                notebook indefinitely.

        Raises:
            Whatever requests.get raises for connection problems or an
            exceeded timeout; the HTTP status code is not checked.
        """
        self.url = url
        response = requests.get(url, headers=headers, timeout=timeout)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')

        # .string is None for an empty <title> or one containing nested tags;
        # fall back to the placeholder instead of rendering the text "None".
        title_text = soup.title.string if soup.title else None
        self.title = title_text if title_text is not None else "No title found"

        if soup.body:
            # Drop elements that carry no readable content before extracting text.
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""

        # Keep only anchors that actually have a non-empty href.
        hrefs = (anchor.get('href') for anchor in soup.find_all('a'))
        self.links = [href for href in hrefs if href]

    def get_contents(self):
        """Return the title and text formatted as a block for inclusion in a prompt."""
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"

In [4]:
def get_links_user_prompt(website):
    """
    Build the user message asking the model to pick brochure-relevant links.

    Args:
        website: object exposing `url` (string) and `links` (list of strings).

    Returns:
        The prompt text: instructions followed by one link per line.
    """
    user_prompt = f"Here is the list of links on the website of {website.url} - "
    # Adjacent literals instead of a backslash continuation inside the string:
    # the continuation embedded the next source line's indentation in the prompt.
    user_prompt += (
        "please decide which of these are relevant web links for a brochure about the company, "
        "respond with the full https URL in JSON format. "
        "Do not include Terms of Service, Privacy, email links.\n"
    )
    user_prompt += "Links (some might be relative links):\n"
    user_prompt += "\n".join(website.links)
    return user_prompt

In [5]:
def get_relevant_links(url):
    """
    Scrape `url` and ask the model which of its links belong in a brochure.

    Args:
        url: address of the page whose links should be classified.

    Returns:
        The model's reply parsed from JSON.
    """
    page = Website(url)
    conversation = [
        {"role": "system", "content": link_system_prompt},
        {"role": "user", "content": get_links_user_prompt(page)},
    ]
    completion = openai.chat.completions.create(
        model=MODEL,
        messages=conversation,
        # Request a JSON object so the reply can be handed to json.loads.
        response_format={"type": "json_object"},
    )
    return json.loads(completion.choices[0].message.content)

# Multi-shot Prompting

In [21]:
# System message for the link-selection request made in get_relevant_links.
# It shows the model two example replies, each a JSON object with a "links"
# list whose entries have "type" and "url" keys; get_all_details reads those
# same keys from the parsed reply.
link_system_prompt = """You are provided with a list of links found on a webpage. 
You should decide which of the links would be most relevant to include in a brochure about the company, 
such as links to an About page, Company page, or Careers/Jobs pages.

Respond in JSON as shown in the following examples:

Example 1:
{
    "links": [
        {"type": "about page", "url": "https://example.com/about"},
        {"type": "careers page", "url": "https://example.com/careers"},
        {"type": "contact page", "url": "https://example.com/contact"}
    ]
}

Example 2:
{
    "links": [
        {"type": "about page", "url": "https://mycompany.com/about"},
        {"type": "team page", "url": "https://mycompany.com/team"},
        {"type": "blog page", "url": "https://mycompany.com/blog"}
    ]
}
"""

In [22]:
def get_all_details(url):
    """
    Collect the text of the landing page plus every link the model selected.

    Args:
        url: address of the company's landing page.

    Returns:
        One string: a "Landing page:" section followed by one section per
        selected link, each headed by the link's type.

    Note:
        The landing page is fetched here and again inside get_relevant_links.
    """
    sections = ["Landing page:\n", Website(url).get_contents()]
    links = get_relevant_links(url)
    print("Found links:", links)

    # The reply is model-generated JSON, so its shape is not guaranteed to
    # match the examples in the system prompt. Tolerate a missing or empty
    # "links" list and skip entries without a usable "url", rather than
    # aborting the whole brochure on a KeyError.
    selected = links.get("links") if isinstance(links, dict) else None
    for link in selected or []:
        if not isinstance(link, dict) or not link.get("url"):
            continue
        # Fall back to a generic heading when the model omitted the type.
        sections.append(f"\n\n{link.get('type', 'page')}\n")
        sections.append(Website(link["url"]).get_contents())
    return "".join(sections)

# Making the Brochure

In [23]:
# System message for the brochure-writing request made in create_brochure.
# Written as adjacent literals so every sentence boundary keeps its space:
# a backslash continuation inside the string previously produced
# "markdown.Include" with no separator.
system_prompt = (
    "You are an assistant that analyzes the contents of several relevant pages from a company website "
    "and creates a short brochure about the company for prospective customers, investors and recruits. "
    "Respond in markdown. "
    "Include details of company culture, customers and careers/jobs if you have the information."
)


In [24]:
def get_brochure_user_prompt(company_name, url, max_chars=5_000):
    """
    Build the user message for the brochure request.

    Args:
        company_name: name of the company, quoted at the top of the prompt.
        url: landing page address; its contents and those of the selected
            links are gathered with get_all_details.
        max_chars: maximum length of the returned prompt in characters.
            Defaults to 5,000; pass None to disable truncation.

    Returns:
        The prompt text, cut to at most `max_chars` characters.
    """
    user_prompt = f"You are looking at a company called: {company_name}\n"
    user_prompt += "Here are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\n"
    user_prompt += get_all_details(url)
    # Slicing with None as the bound keeps the whole string.
    return user_prompt[:max_chars]

In [25]:
def create_brochure(company_name, url):
    """
    Generate a brochure for the company and render it as Markdown in the notebook.

    Args:
        company_name: name of the company the brochure is about.
        url: address of the company's landing page.

    Returns:
        None; the brochure is displayed, not returned.
    """
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": get_brochure_user_prompt(company_name, url)},
    ]
    completion = openai.chat.completions.create(model=MODEL, messages=conversation)
    brochure_markdown = completion.choices[0].message.content
    display(Markdown(brochure_markdown))

In [26]:
# Build and display the brochure for the example company.
create_brochure(company_name="HuggingFace", url="https://huggingface.co")

Found links: {'links': [{'type': 'about page', 'url': 'https://huggingface.co/huggingface'}, {'type': 'careers page', 'url': 'https://apply.workable.com/huggingface/'}, {'type': 'blog page', 'url': 'https://huggingface.co/blog'}, {'type': 'contact page', 'url': 'https://www.linkedin.com/company/huggingface/'}]}


# Hugging Face Brochure

## About Us
**Hugging Face** is at the forefront of the artificial intelligence community, dedicated to building the future through collaboration and innovation. With a mission to democratize machine learning, we provide a platform where researchers, developers, and enterprises can create, discover, and share cutting-edge machine learning models, datasets, and applications.

## Our Offerings
- **Models:** Explore over **1 million state-of-the-art models** across various applications.
- **Datasets:** Access and share **250,000+ datasets** tailored for any machine learning task in the community.
- **Spaces:** Create and host applications seamlessly, with access to running applications like DeepSite and Dia.
- **Enterprise Solutions:** Tailored solutions designed for organizations, offering advanced features and dedicated support.

## Our Customers
More than **50,000 organizations** trust Hugging Face, including some of the world's leading tech companies:
- **Meta**
- **Amazon**
- **Google**
- **Microsoft**
- **Grammarly**

Each of these companies actively participates in building, sharing, and leveraging ML tools through our platform.

## Company Culture
Hugging Face thrives on a dynamic community-focused environment where collaboration is key. Our culture emphasizes:
- **Inclusivity:** Encouraging contributions from diverse perspectives and backgrounds.
- **Innovation:** Constantly striving to improve and expand the boundaries of machine learning.
- **Open Source:** We believe in the power of collective knowledge. Our open-source tools, such as Transformers and Diffusers, are used by thousands to drive ML forward.

## Careers at Hugging Face
We are always on the lookout for passionate individuals eager to make an impact in the field of AI and machine learning. Positions in various roles are available as we expand, from engineers to community managers. By joining our team, you become part of a community dedicated to advancing AI technology and fostering collaboration.

### Current Openings
Visit our careers page for the latest job opportunities and start your journey with us!

## Join Our Community
Whether you are a developer, researcher, or just curious about AI, Hugging Face is the platform for you. Sign up today to unlock the potential of machine learning and be part of this exciting journey!

For more information, visit [Hugging Face](https://huggingface.co) or reach out to us on our social channels!  

--- 

*Hugging Face is committed to building the future of AI—one collaboration at a time.*