In [1]:
import os
import requests
import json
from typing import List
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display, update_display
from openai import OpenAI

In [2]:
load_dotenv(override=True)
api_key = os.getenv('OPENAI_API_KEY')

if api_key and api_key.startswith('sk-proj-') and len(api_key)>10:
    print("API key looks good so far")
else:
    print("There might be a problem with your API key? Please visit the troubleshooting notebook!")
    
MODEL = 'gpt-4o-mini'
openai = OpenAI()

API key looks good so far


### Web Scraping

In [3]:
headers = {
 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}

class Website:
    """
    A utility class to represent a Website that we have scraped, now with links
    """

    def __init__(self, url):
        self.url = url
        response = requests.get(url, headers=headers)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')
        self.title = soup.title.string if soup.title else "No title found"
        if soup.body:
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""
        links = [link.get('href') for link in soup.find_all('a')]
        self.links = [link for link in links if link]

    def get_contents(self):
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"

### Type of Prompts

#### Extract the internal website links 

In [4]:
link_system_prompt = "You are provided with a list of links found on a webpage. \
You are able to decide which of the links would be most relevant to include in a brochure about the company, \
such as links to an About page, or a Company page, or Careers/Jobs pages.\n"
link_system_prompt += "You should respond in JSON as in this example:"
link_system_prompt += """
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page", "url": "https://another.full.url/careers"}
    ]
}
"""

In [5]:
def get_links_user_prompt(website):
    user_prompt = f"Here is the list of links on the website of {website.url} - "
    user_prompt += "please decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format. \
Do not include Terms of Service, Privacy, email links.\n"
    user_prompt += "Links (some might be relative links):\n"
    user_prompt += "\n".join(website.links)
    return user_prompt

In [6]:
def get_links(url):
    website = Website(url)
    response = openai.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": link_system_prompt},
            {"role": "user", "content": get_links_user_prompt(website)}
      ],
        response_format={"type": "json_object"}
    )
    result = response.choices[0].message.content
    return json.loads(result)

In [7]:
get_links("https://huggingface.co")

{'links': [{'type': 'about page', 'url': 'https://huggingface.co/huggingface'},
  {'type': 'careers page', 'url': 'https://apply.workable.com/huggingface/'},
  {'type': 'company page',
   'url': 'https://www.linkedin.com/company/huggingface/'}]}

#### Extract data from the external links and make the brouche

In [8]:
def get_all_details(url):
    result = "Landing page:\n"
    result += Website(url).get_contents()
    links = get_links(url)
    print("Found links:", links)
    for link in links["links"]:
        result += f"\n\n{link['type']}\n"
        result += Website(link["url"]).get_contents()
    return result

In [9]:
system_prompt = "You are an assistant that analyzes the contents of several relevant pages from a company website \
and creates a short brochure about the company for prospective customers, investors and recruits. Respond in markdown.\
Include details of company culture, customers and careers/jobs if you have the information."

In [10]:
def get_brochure_user_prompt(company_name, url):
    user_prompt = f"You are looking at a company called: {company_name}\n"
    user_prompt += f"Here are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\n"
    user_prompt += get_all_details(url)
    user_prompt = user_prompt[:5_000] # Truncate if more than 5,000 characters
    return user_prompt

In [11]:
def stream_brochure(company_name, url):
    stream = openai.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": get_brochure_user_prompt(company_name, url)}
          ],
        stream=True
    )
    
    response = ""
    display_handle = display(Markdown(""), display_id=True)
    for chunk in stream:
        response += chunk.choices[0].delta.content or ''
        response = response.replace("```","").replace("markdown", "")
        update_display(Markdown(response), display_id=display_handle.display_id)

#### Convert final brochue into spanish

In [33]:
system_prompt_spansih = "You are an assistant that convert brochue data into spanish language. Respond in markdown"

In [34]:
def get_brochure_user_prompt_spanish(company_name, url):
    user_prompt = f"You are looking at a company called: {company_name}\n"
    user_prompt += f"Here are the brochue for the company; use this information to build a spanish version of the brochure of the company in markdown.\n"
    user_prompt += f"Company Brochue: {stream_brochure(company_name, url)}\n"
    # user_prompt = user_prompt[:5_000] # Truncate if more than 5,000 characters
    return user_prompt

In [35]:
def stream_brochure_spanish(company_name, url):
    stream = openai.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": system_prompt_spansih},
            {"role": "user", "content": get_brochure_user_prompt_spanish(company_name, url)}
          ],
        stream=True
    )
    
    response = ""
    display_handle = display(Markdown(""), display_id=True)
    for chunk in stream:
        response += chunk.choices[0].delta.content or ''
        response = response.replace("```","").replace("markdown", "")
        update_display(Markdown(response), display_id=display_handle.display_id)

In [36]:
stream_brochure_spanish("HuggingFace", "https://huggingface.co")

Found links: {'links': [{'type': 'about page', 'url': 'https://huggingface.co/huggingface'}, {'type': 'careers page', 'url': 'https://apply.workable.com/huggingface/'}, {'type': 'enterprise page', 'url': 'https://huggingface.co/enterprise'}, {'type': 'pricing page', 'url': 'https://huggingface.co/pricing'}, {'type': 'blog', 'url': 'https://huggingface.co/blog'}, {'type': 'community forum', 'url': 'https://discuss.huggingface.co'}, {'type': 'GitHub page', 'url': 'https://github.com/huggingface'}, {'type': 'LinkedIn page', 'url': 'https://www.linkedin.com/company/huggingface/'}, {'type': 'Twitter page', 'url': 'https://twitter.com/huggingface'}]}



# Hugging Face Brochure

## Welcome to Hugging Face

Hugging Face is a vibrant community at the forefront of artificial intelligence, dedicated to building the future of machine learning. Our platform is where innovators come together to collaborate on models, datasets, applications, and the latest in AI technology.

---

## What We Do

### Collaborate and Innovate
- **Explore AI Models:** Browse our extensive library of **1M+ models**, including cutting-edge contributions from leaders in the field such as Microsoft, AI at Meta, and Google.
- **Datasets:** Access **250k+ datasets** tailored for various ML tasks, promoting a philosophy of open collaboration.
- **Create and Deploy:** Utilize our **Spaces** to create applications and showcase your work, with easy deployment on **optimized GPU endpoints**.
  
---

## Our Community

Join a thriving community of **over 50,000 organizations** worldwide, including major enterprises like Amazon, Intel, and Grammarly. Our community is fueled by collaboration, shared knowledge, and an open-source ethos that empowers everyone to contribute.

### Recent Trends
Check out the trending models and applications on our platform, from video generation to synthetic data generation, all powered by state-of-the-art AI technology.

---

## Company Culture

At Hugging Face, we believe in fostering an inclusive and innovative company culture where every voice is valued. Our team is passionate about advancing technology while prioritizing ethics and safety in AI development. We promote an environment of continuous learning and encourage our members to share their ideas and projects, embodying the collaborative spirit of our community.

---

## Careers at Hugging Face

Join us in shaping the future of AI! We are always on the lookout for talented individuals from diverse backgrounds who are eager to contribute to this cutting-edge field. 
- **Work Perks:** Flexible working hours, opportunities for professional development, and a community-focused environment.
- **Current Openings:** Visit our [Careers page](https://huggingface.co/jobs) to explore available positions.

---

## Get Started

Ready to dive into the world of AI? 
- Sign up for free and start creating, discovering, and collaborating today!
- For enterprise solutions, get in touch with us to learn more about dedicated support and resources tailored for your organization.

---

## Connect With Us

Stay updated with the latest news, and advancements in AI:
- Follow us on **[Twitter](https://twitter.com/huggingface)**, **[LinkedIn](https://linkedin.com/company/huggingface)**, and **[Discord](https://discord.gg/huggingface)**.

Let’s build the future of machine learning together!

---

This brochure encapsulates the mission, offerings, community, culture, career opportunities, and connection options available at Hugging Face, designed for prospective customers, investors, and recruits.

# HuggingFace

## Acerca de Nosotros

HuggingFace es una empresa líder en el desarrollo de inteligencia artificial y procesamiento de lenguaje natural (NLP). Nos dedicamos a construir herramientas y modelos que hacen que la IA sea accesible para todos.

## Nuestra Misión

Nuestra misión es democratizar la inteligencia artificial. Creemos que todos deberían poder utilizar y contribuir a la revolución de la IA. Trabajamos para crear un entorno donde los investigadores y desarrolladores puedan compartir y colaborar en proyectos innovadores.

## Nuestros Productos

### Transformers
Los Transformers son nuestros modelos más avanzados en el campo del NLP. Con más de 100,000 modelos pre-entrenados, tenemos una solución para cada necesidad.

### Datasets
Proporcionamos acceso a una amplia gama de conjuntos de datos, que facilitan la creación y entrenamiento de modelos de IA de alto rendimiento.

### Harrington
Nuestra herramienta de desarrollo de IA permite a las empresas construir aplicaciones personalizadas de IA de manera rápida y eficiente.

## Comunidad

En HuggingFace fomentamos una comunidad activa donde los desarrolladores, investigadores y entusiastas pueden intercambiar ideas, colaborar y aprender unos de otros. Ofrecemos múltiples recursos, tutoriales y foros para facilitar el aprendizaje.

## Conectar con Nosotros

¡Únete a la revolución de la IA con HuggingFace! Conéctate con nosotros en:

- [Twitter](https://twitter.com/huggingface)
- [GitHub](https://github.com/huggingface)
- [Website](https://huggingface.co)

---

¡Descubre cómo HuggingFace puede ayudarte a transformar tus ideas en realidad!