In [1]:
import os 
import re
import requests
import socket
from requests.exceptions import ConnectionError
from requests.exceptions import MissingSchema
from requests.exceptions import InvalidSchema
from urllib3.exceptions import MaxRetryError, NameResolutionError
import json
from typing import List
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display, update_display 
import ollama

In [2]:
load_dotenv(override=True)
MODEL = 'llama3.2'

In [3]:
headers = {
    "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 \
    (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}

class Website:
    def __init__(self,url):
        self.url = url
        response = requests.get(url, headers=headers)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')
        self.title=soup.title.string if soup.title else "No title found"
        if soup.body:
            for irrelevant in soup.body(["script","style","img","input"]):
                irrelevant.decompose()
            self.text=soup.body.get_text(separator="\n", strip=True)
        else:
            self.text=""
        links =[link.get('href') for link in soup.find_all('a')]
        self.links = [link for link in links if link]

    def get_contents(self):
        return f"Webpage Title: \n{self.title}\nWebpage Contents:\n{self.text}\n\n"

In [4]:
page = Website("https://huggingface.co")
page.links

['/',
 '/models',
 '/datasets',
 '/spaces',
 '/posts',
 '/docs',
 '/enterprise',
 '/pricing',
 '/login',
 '/join',
 '/spaces',
 '/models',
 '/nvidia/parakeet-tdt-0.6b-v2',
 '/Wan-AI/Wan2.1-VACE-14B',
 '/nari-labs/Dia-1.6B',
 '/multimodalart/isometric-skeumorphic-3d-bnb',
 '/lodestones/Chroma',
 '/models',
 '/spaces/enzostvs/deepsite',
 '/spaces/Lightricks/ltx-video-distilled',
 '/spaces/smolagents/computer-agent',
 '/spaces/ByteDance/DreamO',
 '/spaces/NihalGazi/FLUX-Pro-Unlimited',
 '/spaces',
 '/datasets/openbmb/Ultra-FineWeb',
 '/datasets/PrimeIntellect/INTELLECT-2-RL-Dataset',
 '/datasets/nvidia/OpenCodeReasoning',
 '/datasets/nvidia/OpenMathReasoning',
 '/datasets/DMindAI/DMind_Benchmark',
 '/datasets',
 '/join',
 '/pricing#endpoints',
 '/pricing#spaces',
 '/pricing',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/allenai',
 '/facebook',
 '/amazon',
 '/google',
 '/Intel',
 '/microsoft',
 '/grammarly',
 '/Writer',


In [5]:
link_system_prompt = "You are provided with a list of links found on a webpage. \
You are able to decide which of the links would be most relevant to include in \
a brochure about the company, such as links to an About page, or a Company page, \
or Careers/Jobs pages. \n"
link_system_prompt += "You should respond only in JSON, without text, object as in this example:"
link_system_prompt += """
{
    "links":[
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page": "url": "https://another.full.url/careers"}
    ]
}
"""

print(link_system_prompt)

You are provided with a list of links found on a webpage. You are able to decide which of the links would be most relevant to include in a brochure about the company, such as links to an About page, or a Company page, or Careers/Jobs pages. 
You should respond only in JSON, without text, object as in this example:
{
    "links":[
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page": "url": "https://another.full.url/careers"}
    ]
}



In [6]:
def get_links_user_prompt(website):
    user_prompt = f"Here is the list of links on the website of {website.url} -"
    user_prompt += "please decide which of these are relevant web links for \
    a brochure about the company, respond with the full https URL in clean JSON format \
    wihout text json on the beginning of the response. \
    Do not include Terms of Service, Privacy, email links.\n"
    user_prompt += "Links (some might be relative links):\n"
    user_prompt += "\n".join(website.links)
    return user_prompt

print(get_links_user_prompt(page))

Here is the list of links on the website of https://huggingface.co -please decide which of these are relevant web links for     a brochure about the company, respond with the full https URL in clean JSON format     wihout text json on the beginning of the response.     Do not include Terms of Service, Privacy, email links.
Links (some might be relative links):
/
/models
/datasets
/spaces
/posts
/docs
/enterprise
/pricing
/login
/join
/spaces
/models
/nvidia/parakeet-tdt-0.6b-v2
/Wan-AI/Wan2.1-VACE-14B
/nari-labs/Dia-1.6B
/multimodalart/isometric-skeumorphic-3d-bnb
/lodestones/Chroma
/models
/spaces/enzostvs/deepsite
/spaces/Lightricks/ltx-video-distilled
/spaces/smolagents/computer-agent
/spaces/ByteDance/DreamO
/spaces/NihalGazi/FLUX-Pro-Unlimited
/spaces
/datasets/openbmb/Ultra-FineWeb
/datasets/PrimeIntellect/INTELLECT-2-RL-Dataset
/datasets/nvidia/OpenCodeReasoning
/datasets/nvidia/OpenMathReasoning
/datasets/DMindAI/DMind_Benchmark
/datasets
/join
/pricing#endpoints
/pricing#space

In [8]:
def get_links(url):
    website = Website(url)
    response = ollama.chat(
        model=MODEL,
        messages=[
            {"role": "system", "content": link_system_prompt},
            {"role": "user", "content": get_links_user_prompt(website)}
        ],
        options={"format": "json"}
    )
    result = response['message']['content']

    print(result)
    try:
        content_json = json.loads(result)
        return content_json
    except json.JSONDecodeError:
        print("Odpowiedź nie jest poprawnym JSON")

huggingface = Website("https://huggingface.co")
huggingface.links
get_links("https://huggingface.co")

{
    "links":[
        {"type": "About page", "url": "https://huggingface.co/"},
        {"type": "Company page", "url": "https://huggingface.co/brand"},
        {"type": "Careers/Jobs page", "url": "https://apply.workable.com/huggingface/"},
        {"type": "Documentation page", "url": "https://docs.huggingface.co/"},
        {"type": "Blog page", "url": "https://blog.huggingface.co/"},
        {"type": "Community page", "url": "https://discuss.huggingface.co/"}
    ]
}


{'links': [{'type': 'About page', 'url': 'https://huggingface.co/'},
  {'type': 'Company page', 'url': 'https://huggingface.co/brand'},
  {'type': 'Careers/Jobs page',
   'url': 'https://apply.workable.com/huggingface/'},
  {'type': 'Documentation page', 'url': 'https://docs.huggingface.co/'},
  {'type': 'Blog page', 'url': 'https://blog.huggingface.co/'},
  {'type': 'Community page', 'url': 'https://discuss.huggingface.co/'}]}

In [16]:
def get_all_details(url):
    result = "Landing page:\n"
    result += Website(url).get_contents()
    
    links = get_links(url)
    if links and "links" in links:
        for link in links["links"]:
            try:
                result += f"\n\nLink:\n{link}\n"
                result += Website(link).get_contents()
            except socket.gaierror as e:
                print(f"DNS resolution failed: {e}")
            except NameResolutionError as e:
                print(f"Name resolution error: {e}")
            except MaxRetryError as e:
                print(f"Max retries exceeded: {e}")
            except ConnectionError as e:
                print(f"Connection error: {e}")
            except MissingSchema as e:
                print(f"Invalid URL schema: {e}")
            except InvalidSchema as e:
                print(f"Omitted unsupported URL (InvalidSchema): {e}")
    else:
        print("No links found or failed to retrieve links.")
    
    return result


print(get_all_details("https://huggingface.co"))

{
    "links": [
        {"type": "About page", "url": "https://huggingface.co/"},
        {"type": "Company page", "url": "https://huggingface.co/brand"},
        {"type": "Careers/Jobs page", "url": "https://apply.workable.com/huggingface/"},
        {"type": "Blog", "url": "https://blog.huggingface.co/"},
        {"type": "Discussions", "url": "https://discuss.huggingface.co/"},
        {"type": "GitHub page", "url": "https://github.com/huggingface"},
        {"type": "Twitter page", "url": "https://twitter.com/huggingface"},
        {"type": "LinkedIn page", "url": "https://www.linkedin.com/company/huggingface/"}
    ]
}
Omitted unsupported URL (InvalidSchema): No connection adapters were found for "{'type': 'About page', 'url': 'https://huggingface.co/'}"
Omitted unsupported URL (InvalidSchema): No connection adapters were found for "{'type': 'Company page', 'url': 'https://huggingface.co/brand'}"
Omitted unsupported URL (InvalidSchema): No connection adapters were found for "{'ty

In [17]:
system_prompt = "You are an assistant that analyzes the contents of sevveral relevant pages from a company website \
and creates a short brochure about the company for prospective customers, investors and recruits. Respond in markdown. \
Include details of company culture, customers and careers/jobs if you have the information."

In [19]:
def get_brochure_user_prompt(company_name, url):
    user_prompt =f"You are looking at a company called: {company_name}\n"
    user_prompt += f"Here are the contents of its landing page and other relevant pages; \
    use this information to build a short brochure of the company in markdown.\n"
    user_prompt += get_all_details(url)
    userp_prompt = user_prompt[:5_000]
    return user_prompt

get_brochure_user_prompt("HuggingFace", "https://huggingface.co")

{
    "links": [
        {
            "type": "about page",
            "url": "https://huggingface.co/"
        },
        {
            "type": "blog",
            "url": "https://discuss.huggingface.co"
        },
        {
            "type": "github",
            "url": "https://github.com/huggingface"
        },
        {
            "type": "twitter",
            "url": "https://twitter.com/huggingface"
        },
        {
            "type": "linkedin",
            "url": "https://www.linkedin.com/company/huggingface/"
        },
        {
            "type": "pricing page",
            "url": "https://endpoints.huggingface.co/pricing#endpoints"
        }
    ]
}
Omitted unsupported URL (InvalidSchema): No connection adapters were found for "{'type': 'about page', 'url': 'https://huggingface.co/'}"
Omitted unsupported URL (InvalidSchema): No connection adapters were found for "{'type': 'blog', 'url': 'https://discuss.huggingface.co'}"
Omitted unsupported URL (InvalidSchema): 

"You are looking at a company called: HuggingFace\nHere are the contents of its landing page and other relevant pages;     use this information to build a short brochure of the company in markdown.\nLanding page:\nWebpage Title: \nHugging Face – The AI community building the future.\nWebpage Contents:\nHugging Face\nModels\nDatasets\nSpaces\nPosts\nDocs\nEnterprise\nPricing\nLog In\nSign Up\nThe AI community building the future.\nThe platform where the machine learning community collaborates on models, datasets, and applications.\nExplore AI Apps\nor\nBrowse 1M+ models\nTrending on\nthis week\nModels\nnvidia/parakeet-tdt-0.6b-v2\nUpdated\n4 days ago\n•\n56.6k\n•\n984\nWan-AI/Wan2.1-VACE-14B\nUpdated\n1 day ago\n•\n8.8k\n•\n201\nnari-labs/Dia-1.6B\nUpdated\n6 days ago\n•\n143k\n•\n2.27k\nmultimodalart/isometric-skeumorphic-3d-bnb\nUpdated\n5 days ago\n•\n550\n•\n196\nlodestones/Chroma\nUpdated\n2 days ago\n•\n620\nBrowse 1M+ models\nSpaces\nRunning\n6.8k\n6.8k\nDeepSite\n🐳\nGenerate any

In [21]:
def create_brochure(company_name, url):
    response = ollama.chat(
        model=MODEL,
        messages=[
            {"role": "system","content": system_prompt},
            {"role": "user", "content": get_brochure_user_prompt(company_name, url)}
        ],
    )
    result = response['message']['content']
    display(Markdown(result))

create_brochure("HuggingFace", "https://huggingface.co")

{
  "links": [
    "https://huggingface.co/",
    "https://endpoints.huggingface.co",
    "https://discuss.huggingface.co",
    "https://status.huggingface.co/",
    "https://github.com/huggingface"
  ],
  "spaces": [
    "https://spaces.huggingface.co/",
    "https://huggingface.co/spaces/enzostvs/deepsite",
    "https://huggingface.co/spaces/Lightricks/ltx-video-distilled",
    "https://huggingface.co/spaces/smolagents/computer-agent"
  ],
  "datasets": [
    "https://huggingface.co/datasets/",
    "https://huggingface.co/datasets/openbmb/Ultra-FineWeb",
    "https://huggingface.co/datasets/PrimeIntellect/INTELLECT-2-RL-Dataset",
    "https://huggingface.co/datasets/nvidia/OpenCodeReasoning",
    "https://huggingface.co/datasets/nvidia/OpenMathReasoning",
    "https://huggingface.co/datasets/DMindAI/DMind_Benchmark"
  ],
  "docs": [
    "https://huggingface.co/docs/",
    "https://huggingface.co/docs/transformers",
    "https://huggingface.co/docs/diffusers",
    "https://huggingface

The provided links are related to the Hugging Face website and its various components. Here's a summary of what each link appears to be:

1. **Hugging Face status**: This page provides updates on the current status of Hugging Face services, including maintenance, incidents, and downtime.
2. **GitHub profile for Hugging Face**: This is the official GitHub profile for Hugging Face, showcasing their repositories, projects, packages, and people.
3. **Repositories**: The link takes you to a list of all public repositories hosted by Hugging Face, which include popular libraries like Transformers, Diffusers, and Optimum.
4. **People page**: This page lists the top contributors to Hugging Face's repositories, including authors, maintainers, and collaborators.
5. **Packages**: This section provides information on available packages and their dependencies for use with Hugging Face's libraries.

In summary, these links appear to be related to:

* Status updates on Hugging Face services
* The official GitHub profile for Hugging Face
* A list of public repositories hosted by Hugging Face
* Information on contributors to Hugging Face's repositories
* Available packages and their dependencies

The provided information seems to be a snapshot of the current state of Hugging Face's website and its components, providing users with an overview of what's available and what's currently up-to-date.

In [None]:
def stream_brochure(company_name, url):
    stream = ollama.chat(
        model=MODEL,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": get_brochure_user_prompt(company_name,url)}
        ],
        stream=True
    )
    result =""
    display_handle = display(Markdown(''),display_id=True)
    for chunk in stream:
        result += chunk['message']['content'] or''

        result = result.replace("```","").replace("markdown","")
        update_display(Markdown(result), display_id = display_handle.display_id)

stream_brochure("HuggingFace", "https://huggingface.co")

{
    "links":[
        {"type": "About page", "url": "https://huggingface.co/team"},
        {"type": "Company page", "url": "https://huggingface.co/brand"},
        {"type": "Blog", "url": "https://blog.huggingface.co"},
        {"type": "Documentation", "url": "https://docs.huggingface.co"},
        {"type": "GitHub", "url": "https://github.com/huggingface"},
        {"type": "Twitter", "url": "https://twitter.com/huggingface"},
        {"type": "LinkedIn", "url": "https://www.linkedin.com/company/huggingface/"}
    ]
}
Omitted unsupported URL (InvalidSchema): No connection adapters were found for "{'type': 'About page', 'url': 'https://huggingface.co/team'}"
Omitted unsupported URL (InvalidSchema): No connection adapters were found for "{'type': 'Company page', 'url': 'https://huggingface.co/brand'}"
Omitted unsupported URL (InvalidSchema): No connection adapters were found for "{'type': 'Blog', 'url': 'https://blog.huggingface.co'}"
Omitted unsupported URL (InvalidSchema): No conn

